Set up an API for fetching news
This commit is contained in:
parent 5aa426f8f6
commit a390658907

6 .gitignore vendored
@@ -1,3 +1,7 @@
+data
+*service_accout.json
+*sa.json
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -15,8 +19,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
-lib64/
 parts/
 sdist/
 var/
@@ -1,36 +0,0 @@
-# prefect-template/docker-compose.yml
-services:
-  server:
-    image: prefecthq/prefect:2-latest
-    container_name: prefect-server
-    command: ["prefect","server","start","--host","0.0.0.0"]
-    ports: ["4200:4200"]  # UI: http://localhost:4200
-    environment:
-      PREFECT_UI_URL: "http://localhost:4200"
-      PREFECT_API_URL: "http://server:4200/api"
-      TZ: "Asia/Tokyo"
-      # To use Slack notifications, set SLACK_WEBHOOK_URL in .env
-      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-}
-    volumes:
-      - ./src/flows:/opt/flows
-      - prefect-data:/root/.prefect
-
-  worker:
-    image: prefecthq/prefect:2-latest
-    container_name: prefect-worker
-    depends_on: [server]
-    environment:
-      PREFECT_API_URL: "http://server:4200/api"
-      TZ: "Asia/Tokyo"
-      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-}
-    volumes:
-      - ./src/flows:/opt/flows
-    command: >
-      bash -lc "
-        pip install -r /opt/flows/requirements.txt >/dev/null 2>&1 || true &&
-        prefect work-pool create process-pool -t process || true &&
-        prefect worker start -p process-pool
-      "
-
-volumes:
-  prefect-data:
49 docs/how_to_use.md Normal file
@@ -0,0 +1,49 @@
# How to use Prefect

Inside a flow function, you call functions decorated with Prefect's @task to combine units of work (tasks) and run them.

## Components

### @flow

The decorator that defines a "workflow" (the overall unit of processing) in Prefect.
It turns a Python function into a flow function.

```py
@flow
def etl_flow(d: str | None = None):
    d = d or date.today().isoformat()
    load(transform(extract(d)))
```

The nested task calls are hard to read, so they break down as follows:

```py
@flow
def etl_flow(d=None):
    d = d or date.today().isoformat()
    # load(transform(extract(d)))
    raw = extract(d)
    clean = transform(raw)
    load(clean)
```

### @task

Defines an individual unit of work (a task) managed by Prefect.
It adds retries, log management, dependency tracking, and so on to a plain Python function.

```py
@task(retries=3, retry_delay_seconds=10)
def extract(d):
    return f"raw({d})"
```

* retries:
  * retry up to 3 times
* retry_delay_seconds:
  * wait 10 seconds after a failure, then retry

A runnable end-to-end sketch follows below.
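For reference, a minimal runnable sketch combining @flow and @task as described above — this is editor-added illustration, not part of the commit; it assumes only that Prefect 2.x is installed, and `extract`/`transform`/`load` are toy stand-ins:

```py
from datetime import date
from prefect import flow, task

@task(retries=3, retry_delay_seconds=10)
def extract(d):
    # pretend to fetch raw data for date d
    return f"raw({d})"

@task
def transform(raw):
    return raw.upper()

@task
def load(clean):
    print(f"loaded: {clean}")

@flow
def etl_flow(d: str | None = None):
    d = d or date.today().isoformat()
    load(transform(extract(d)))

if __name__ == "__main__":
    etl_flow()  # runs once locally; task states show up in the Prefect UI
```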
14 examples/example_csv.py Normal file
@@ -0,0 +1,14 @@
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")))

from dotenv import load_dotenv
load_dotenv("../.env")

from lib.csv_collector import CSVWriter


from lib.custom_logger import get_logger
logger = get_logger()

logger.info("Starting CSV example script")
@@ -1,2 +1,8 @@
 requests
+python-dotenv
 prefect
+
+pandas==2.3.2
+
+duckdb==1.3.2
+google-cloud-storage
96 src/flows/api_etl_flow.py Normal file
@@ -0,0 +1,96 @@
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from dotenv import load_dotenv
load_dotenv("../.env")

from prefect import flow, task, get_run_logger
from typing import Optional
from datetime import datetime

from models.csv_model_base import CSVBaseModel
from providers.api_g_news import ApiGNews
from providers.google_cloud_storage_provider import GoogleCloudStorageProvider
from lib.csv_collector import CSVWriter


class NewsData(CSVBaseModel):
    title: str
    url: str = ""
    description: Optional[str] = None
    content: Optional[str] = None
    image_url: Optional[str] = None
    category: str = "politics"
    source_name: str = ""
    language: Optional[str] = "ja"  # e.g. "ja", "en"
    country: Optional[str] = "jp"   # e.g. "jp"
    pub_date: Optional[str] = None  # e.g. "2023-10-01 12:00:00"


@task(retries=2, retry_delay_seconds=10)
def call_api() -> list[dict]:
    logger = get_run_logger()
    logger.info("Starting API ETL Flow")
    rets = ApiGNews.get_top_headlines(
        category="nation",
        lang="ja",
        country="jp",
        query="政治",
    )
    logger.info(f"Fetched {len(rets)} articles from GNews API")
    return rets


@task()
def format_to_model(ret: list[dict]) -> list[NewsData]:
    """Convert the API response into models."""
    logger = get_run_logger()
    logger.info("Formatting API response to NewsData models")
    models = []
    for item in ret:
        model = NewsData(
            title=item['title'],
            url=item['url'],
            description=item.get('description', None),
            content=item.get('content', None),
            image_url=item.get('image', None),
            pub_date=item.get('publishedAt', None),
            category="politics",
            source_name=item.get('source', {}).get('name', ""),
            language="ja",
            country="jp",
        )
        models.append(model)
    logger.info(f"Formatted {len(models)} NewsData models")
    return models


@task()
def write_csv(models: list[NewsData]):
    logger = get_run_logger()
    logger.info("Writing NewsData models to CSV")
    csv_data = NewsData.to_csv_from_items(models)
    dt = datetime.now()
    dt_str = dt.strftime("%Y-%m-%d")
    file_name = f"news_{dt_str}_part-001.csv"
    prefix = f"data_science/data/y={dt.strftime('%Y')}/news"
    provider = GoogleCloudStorageProvider()
    bucket_name = os.getenv("GCS_BUCKET_NAME")
    provider.write_csv_item(
        bucket=bucket_name,
        object_name=f"{prefix}/{file_name}",
        records=csv_data,
    )


@flow
def api_etl_flow():
    # Extract: call the API and convert the response into models
    ret = call_api()
    models = format_to_model(ret)
    # Load: write out the CSV
    write_csv(models)


if __name__ == "__main__":
    api_etl_flow()
12 src/lib/csv_collector/__init__.py Normal file
@@ -0,0 +1,12 @@
from .csv_writer import CSVWriter
from .csv_reader import CSVReader
from .csv_editor import CSVEditColumn, CSVEditMapper
from .csv_analyzer import CSVAnalyzer

__all__ = [
    "CSVWriter",
    "CSVReader",
    "CSVEditColumn",
    "CSVEditMapper",
    "CSVAnalyzer",
]
118 src/lib/csv_collector/csv_analyzer.py Normal file
@@ -0,0 +1,118 @@
import os
import pandas as pd
from zoneinfo import ZoneInfo
from typing import Union
from utils.types import DataLayer

from lib.custom_logger import get_logger
logger = get_logger()

from .csv_writer import CSVWriter
from .csv_reader import CSVReader

class CSVAnalyzer:

    @classmethod
    def _separate_month_to_df(
            cls,
            header: list,
            data_rows: list,
            date_key: str = "published_at",
            tz: str | None = None) -> pd.DataFrame | None:

        if not data_rows:
            return None

        df = pd.DataFrame(data_rows, columns=header)
        # Normalize the date column (parse to datetime, convert the time zone)
        df[date_key] = pd.to_datetime(df[date_key], errors="coerce", utc=True)
        if tz:
            df[date_key] = df[date_key].dt.tz_convert(ZoneInfo(tz))
        # Add a year-month column
        df["year_month"] = df[date_key].dt.to_period("M")
        # Grouping into per-month dict lists is left to the callers
        return df

    @classmethod
    def separate_month_to_dict(
            cls,
            header: list,
            data_rows: list,
            date_key: str = "published_at",
            tz: str | None = None) -> dict[str, list[dict]] | None:
        """
        Split the data by year-month (list of lists -> dict of lists of dicts).
        """
        df = cls._separate_month_to_df(header, data_rows, date_key, tz)
        if df is None:
            return None

        return {
            str(ym): g.drop(columns=["year_month"]).to_dict(orient="records")
            for ym, g in df.groupby("year_month", sort=True)
        }


    @classmethod
    def write_separated_month(
            cls,
            records,
            domain: str,
            event: str,
            layer: Union[str, DataLayer],
            prefix: str = None,
            data_format: str = "%Y-%m",
            is_year: bool = True,
            is_month: bool = True,
            data_key: str = "published_at",
            tz: str | None = None,
    ):
        """Split the data by year-month and save each group to a CSV file."""
        if not records or len(records) < 2:
            logger.warning("No records to process.")
            return
        header = records[0]
        data_rows = records[1:]

        df = cls._separate_month_to_df(header, data_rows, data_key, tz)
        if df is None:
            return

        for ym, g in df.groupby("year_month", sort=True):
            logger.info(f"Processing year-month: {ym}")
            y, m = str(ym).split("-")
            folder_path = CSVWriter.get_filepath(
                domain=domain,
                layer=layer)
            if is_year:
                folder_path = f"{folder_path}/y={y}"
            if is_month:
                folder_path = f"{folder_path}/m={m}"

            filename = CSVWriter.get_filename(
                event=event,
                prefix=prefix,
                date_format=data_format,
                dt=str(ym) + "-01",
                extension=".csv"
            )
            fpath = os.path.join(folder_path, filename)
            os.makedirs(folder_path, exist_ok=True)
            logger.info(f"Writing to file: {fpath}")
            g.drop(columns=["year_month"]).to_csv(fpath, index=False, encoding="utf-8")


    # result = {}
    # for year_month, group in df.groupby('year_month'):
    #     year = year_month.year
    #     month = year_month.month
    #     logger.info(f"y={year}/m={month:02d}")
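For illustration, a small editor-added usage sketch of separate_month_to_dict with toy records (assumes src/ is on sys.path, as in examples/example_csv.py):

```py
from lib.csv_collector import CSVAnalyzer

# header row + data rows, in the shape produced by CSVReader.read()
records = [
    ["title", "published_at"],
    ["a", "2023-10-01T12:00:00Z"],
    ["b", "2023-11-05T08:30:00Z"],
]
by_month = CSVAnalyzer.separate_month_to_dict(
    header=records[0], data_rows=records[1:],
    date_key="published_at", tz="Asia/Tokyo",
)
print(list(by_month.keys()))  # ['2023-10', '2023-11']
```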
110 src/lib/csv_collector/csv_editor.py Normal file
@@ -0,0 +1,110 @@
# import os
# import csv
from typing import Any, Optional, TypeVar, Callable
from dataclasses import dataclass
from .csv_reader import CSVReader


from lib.custom_logger import get_logger
logger = get_logger()

T = TypeVar("T")
ColCallback = Callable[[int, list, dict], T]


@dataclass
class CSVEditColumn():
    """Column definition used when editing CSV data."""
    name: str
    value: Any = None
    key_name: str = None
    cb: Optional[ColCallback] = None

    def execute(self, row_index: int, row: list, header_map: dict) -> Any:
        """Resolve the value for this column."""
        try:
            if self.cb:
                return self.cb(row_index, row, header_map)
            elif self.key_name and self.key_name in header_map:
                index = header_map[self.key_name]
                return row[index]
            else:
                return self.value
        except Exception as e:
            logger.error(f"Error in CSVEditColumn.execute: {e}")
            logger.error(f"row_index: {row_index}, row: {row}, header_map: {header_map}")
            logger.error(f"Column info - name: {self.name}, value: {self.value}, key_name: {self.key_name}, cb: {self.cb}")
            raise e

class CSVEditMapper:
    """Mapper that drives CSV editing."""
    def __init__(self, header_map: dict = None):
        self.columns: list[CSVEditColumn] = []
        self.header_map: dict = header_map if header_map else {}

    def add(self, column: CSVEditColumn):
        self.columns.append(column)

    def add_column(self, name: str, key_name: str = None):
        if not key_name:
            key_name = name
        self.columns.append(CSVEditColumn(name, None, key_name))

    def add_value(self, name: str, value: Any):
        self.columns.append(CSVEditColumn(name, value))

    def add_callback(self, name: str, cb: callable):
        self.columns.append(CSVEditColumn(name, cb=cb))

    def auto_columns(self):
        """Add columns automatically from the existing header info."""
        if not self.header_map or len(self.header_map) == 0:
            return

        # Added automatically, ordered by column index
        sorted_items = sorted(self.header_map.items(), key=lambda item: item[1])
        for key, idx in sorted_items:
            self.add_column(name=key, key_name=key)

    def get_column_values(self, key_name: str, row, null_value: Any = None) -> Any:
        idx = self.header_map[key_name]
        if idx is None or idx < 0:
            return null_value

        return row[idx]

    def edit(self, records: list[list]) -> list[list]:
        """Edit the CSV data."""
        new_records = []
        # Add the header row
        header = []
        for col in self.columns:
            header.append(col.name)
        new_records.append(header)
        if not records or len(records) < 2:
            return new_records

        if self.header_map is None or len(self.header_map) == 0:
            self.header_map = CSVReader.header_map(records[0])

        # Apply the transformations row by row
        for i, rows in enumerate(records[1:]):
            new_row = []
            for col in self.columns:
                _value = col.execute(i, rows, self.header_map)
                new_row.append(_value)
            new_records.append(new_row)

        return new_records
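An editor-added usage sketch showing the three ways a column can be filled (copied, constant, computed), under the same sys.path assumption as above:

```py
from lib.csv_collector import CSVEditMapper

records = [
    ["title", "url"],
    ["hello", "https://example.com/1"],
]
mapper = CSVEditMapper()
mapper.add_column("title")                                # copy an existing column
mapper.add_value("category", "politics")                  # constant column
mapper.add_callback("row_no", lambda i, row, hm: i + 1)   # computed column
edited = mapper.edit(records)
# [['title', 'category', 'row_no'], ['hello', 'politics', 1]]
print(edited)
```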
39 src/lib/csv_collector/csv_reader.py Normal file
@@ -0,0 +1,39 @@
import os
import csv
from typing import List, Union
from datetime import datetime
from utils.types import DataLayer

from lib.custom_logger import get_logger
logger = get_logger()

class CSVReader:
    """CSV file reading utility"""
    BASE_DIR = "data"

    @classmethod
    def read(cls, file_path: str) -> List[list]:
        """Read a CSV file as a list of rows."""
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            return []

        with open(file_path, mode="r", newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            return list(reader)


    @classmethod
    def read_dict(cls, file_path: str) -> List[dict]:
        """Read a CSV file as a list of dicts."""
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            return []

        with open(file_path, mode="r", newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            return list(reader)

    @classmethod
    def header_map(cls, headers: list) -> dict[str, int]:
        """Build a name-to-index mapping from a CSV header row."""
        return {h: i for i, h in enumerate(headers)}
178 src/lib/csv_collector/csv_writer.py Normal file
@@ -0,0 +1,178 @@
import os
import csv
from typing import List, Union
from datetime import datetime
from io import StringIO


from utils.types import DataLayer

from lib.custom_logger import get_logger
logger = get_logger()


class CSVWriter:
    """CSV file writing utility"""
    BASE_DIR = "data"

    @classmethod
    def get_filepath(cls,
                     domain: str,
                     layer: Union[str, DataLayer, None],
                     is_year: bool = False,
                     is_month: bool = False,
                     is_day: bool = False,
                     is_hour: bool = False,
                     dt: Union[str, datetime] = None
                     ) -> str:
        """Build the folder path."""
        parts = [cls.BASE_DIR]
        parts.append(domain)
        if layer:
            parts.append(layer)
        if dt is None:
            dt = datetime.now()
        elif isinstance(dt, str):
            dt = datetime.fromisoformat(dt)
        if is_year:
            parts.append(f"y={dt.strftime('%Y')}")
        if is_month:
            parts.append(f"m={dt.strftime('%m')}")
        if is_day:
            parts.append(f"d={dt.strftime('%d')}")
        if is_hour:
            parts.append(f"h={dt.strftime('%H')}")
        folder_path = os.path.join(*parts)
        logger.debug(f"Generated CSV folder path: {folder_path}")
        return folder_path


    @classmethod
    def get_filename(
            cls,
            event: str,
            prefix: str = None,
            date_format: str = "%Y-%m-%d",
            dt: Union[str, datetime] = None,
            part: int = None,
            extension: str = ".csv") -> str:
        """
        Build the CSV file name.

        Args:
            prefix (str, optional): File name prefix. Defaults to None.
            date_format (str, optional): Date format, e.g. "%Y-%m-%d". Defaults to "%Y-%m-%d".
            dt (datetime, optional): Date to embed. Defaults to None (now).
            part (int, optional): Partition number. Defaults to None.
            extension (str, optional): File extension. Defaults to ".csv".
        """
        file_names_part = []
        if prefix:
            file_names_part.append(prefix)
        file_names_part.append(event)

        if date_format:
            # Normalize to a datetime
            if dt is None:
                dt = datetime.now()
            elif isinstance(dt, str):
                dt = datetime.fromisoformat(dt)
            date_str = dt.strftime(date_format)
            file_names_part.append(date_str)

        if part is not None:
            file_names_part.append(f"part-{part:03d}")
        file_name = "_".join(file_names_part) + extension
        logger.debug(f"Generated CSV file name: {file_name}")
        return file_name


    @classmethod
    def write(
            cls,
            records: List,
            domain: str,
            layer: Union[str, DataLayer],
            event: str,
            prefix: str = None,
            date_format: str = "%Y-%m-%d",
            dt: Union[str, datetime] = None,
            part: int = None,
            extension: str = ".csv",
            is_year: bool = False,
            is_month: bool = False,
            is_day: bool = False,
            is_hour: bool = False,
            is_update: bool = False,
    ) -> str:
        """Write CSV records to a file and return the file path."""
        if not records:
            logger.warning("No records to write.")
            return ""
        folder_path = cls.get_filepath(
            domain=domain,
            layer=layer,
            is_year=is_year,
            is_month=is_month,
            is_day=is_day,
            is_hour=is_hour,
            dt=dt
        )

        filename = cls.get_filename(
            event=event,
            prefix=prefix,
            date_format=date_format,
            dt=dt,
            part=part,
            extension=extension)

        os.makedirs(folder_path, exist_ok=True)
        full_filename = os.path.join(folder_path, filename)

        if not is_update and os.path.exists(full_filename):
            logger.info(f"File already exists and will not be overwritten: {full_filename}")
            return full_filename

        with open(full_filename, mode="w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL)
            writer.writerows(records)

        return full_filename

    @classmethod
    def write_with_filename(
            cls,
            records: List,
            filename: str,
            is_update: bool = False,
    ) -> str:
        """Write CSV records to the given file path."""
        if not records:
            logger.warning("No records to write.")
            return ""

        os.makedirs(os.path.dirname(filename), exist_ok=True)

        if not is_update and os.path.exists(filename):
            logger.info(f"File already exists and will not be overwritten: {filename}")
            return filename

        with open(filename, mode="w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL)
            writer.writerows(records)

        return filename

    @classmethod
    def csv_bytes(
            cls,
            records: List,
    ) -> bytes:
        """Render CSV records as bytes."""
        buf = StringIO(newline="")
        writer = csv.writer(buf, quoting=csv.QUOTE_ALL)
        writer.writerows(records)
        return buf.getvalue().encode('utf-8')
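An editor-added sketch of how the path pieces compose (toy records; the expected path in the comment follows from get_filepath and get_filename above):

```py
from lib.csv_collector import CSVWriter
from utils.types import DataLayer

records = [["title", "url"], ["hello", "https://example.com/1"]]
path = CSVWriter.write(
    records,
    domain="news",
    layer=DataLayer.BRONZE,
    event="headlines",
    dt="2023-10-01",
    is_year=True,
    is_month=True,
    part=1,
)
print(path)  # data/news/bronze/y=2023/m=10/headlines_2023-10-01_part-001.csv
```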
56 src/lib/custom_logger.py Normal file
@@ -0,0 +1,56 @@
import logging
import functools
from .singleton import Singleton

class CustomLogger(Singleton):
    """
    Singleton logger class that initializes a logger with a specified name and log file.
    It provides a method to log entry and exit of functions.
    """

    def __init__(self, name='main', log_file=None, level=logging.INFO):
        if hasattr(self, '_initialized') and self._initialized:
            return  # Already initialized; do nothing

        self.logger = logging.getLogger(name)
        self.logger.setLevel(level)
        self.logger.propagate = False

        formatter = logging.Formatter(
            '%(asctime)s %(levelname)s [%(filename)s:%(lineno)3d]: %(message)s'
        )

        # Console handler
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)

        # File handler
        if log_file:
            fh = logging.FileHandler(log_file, encoding='utf-8')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

        self._initialized = True


    def get_logger(self):
        return self.logger

    def log_entry_exit(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            self.logger.info(f"Enter: {func.__qualname__}")
            result = func(*args, **kwargs)
            self.logger.info(f"Exit: {func.__qualname__}")
            return result
        return wrapper


def get_logger(name='main', log_file=None, level=logging.INFO):
    custom_logger = CustomLogger(name, log_file, level)
    return custom_logger.get_logger()
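An editor-added usage sketch; because CustomLogger is a singleton, every call hands back the same underlying logger:

```py
from lib.custom_logger import CustomLogger, get_logger

logger = get_logger()  # same logger instance everywhere

@CustomLogger().log_entry_exit
def fetch():
    logger.info("working...")

fetch()  # logs "Enter: fetch", "working...", "Exit: fetch"
```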
20 src/lib/singleton.py Normal file
@@ -0,0 +1,20 @@
"""Singleton pattern implementation in Python.
This implementation is thread-safe and ensures that only one instance of the class is created.

What Singleton provides is only "a mechanism that returns the same instance":
* __init__() still runs on every call (which is often not what people expect)
* guarding a second __init__() with an _initialized flag is the caller's responsibility
"""

import threading

class Singleton(object):
    _instances = {}
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        if cls not in cls._instances:
            with cls._lock:
                if cls not in cls._instances:  # double check
                    cls._instances[cls] = super(Singleton, cls).__new__(cls)
        return cls._instances[cls]
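An editor-added sketch of the pitfall the docstring warns about: the same instance comes back, but __init__ runs again unless the subclass guards itself (as CustomLogger does):

```py
from lib.singleton import Singleton

class Config(Singleton):
    def __init__(self, value=None):
        if getattr(self, "_initialized", False):
            return  # guard against re-initialization
        self.value = value
        self._initialized = True

a = Config("first")
b = Config("second")  # same instance; the guard makes __init__ a no-op
assert a is b and b.value == "first"
```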
42 src/models/csv_model_base.py Normal file
@@ -0,0 +1,42 @@
from datetime import datetime
import json
from typing import ClassVar, Optional, List
from pydantic import BaseModel

class CSVBaseModel(BaseModel):
    """Base class that adds common CSV helpers to BaseModel"""
    # Each subclass can carry its own exclusion settings
    csv_excludes: ClassVar[List[str]] = []

    @classmethod
    def to_headers(cls, excepts: Optional[List[str]] = None) -> List[str]:
        """Generate the CSV header automatically."""
        fields = list(cls.model_fields.keys())  # keeps declaration order
        if excepts:
            fields = [f for f in fields if f not in excepts]
        return fields

    def to_row(self, excepts: Optional[List[str]] = None) -> List[str]:
        """Convert the instance into a CSV row."""
        header = self.to_headers(excepts=excepts)
        row = []
        for f in header:
            val = getattr(self, f)
            if isinstance(val, (dict, list)):
                row.append(json.dumps(val, ensure_ascii=False))  # dicts/lists become JSON strings
            elif isinstance(val, datetime):
                row.append(val.isoformat())  # datetimes become ISO 8601 strings
            elif val is None:
                row.append("")
            else:
                row.append(str(val))
        return row

    @staticmethod
    def to_csv_from_items(items: List['CSVBaseModel']) -> List:
        """Collect CSV rows for a list of models."""
        if not items:
            return []
        headers = items[0].to_headers()
        rows = [item.to_row() for item in items]
        return [headers] + rows
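An editor-added usage sketch (assumes Pydantic v2, which the model_fields attribute above implies; Article is a toy subclass):

```py
from typing import Optional
from models.csv_model_base import CSVBaseModel

class Article(CSVBaseModel):
    title: str
    url: Optional[str] = None

rows = CSVBaseModel.to_csv_from_items([
    Article(title="hello", url="https://example.com/1"),
    Article(title="world"),  # url=None serializes as ""
])
# [['title', 'url'], ['hello', 'https://example.com/1'], ['world', '']]
print(rows)
```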
95 src/providers/api_g_news.py Normal file
@@ -0,0 +1,95 @@
import requests
import os

from lib.custom_logger import get_logger
logger = get_logger()

class ApiGNews:
    """
    Class for working with GNews.

    Notes:
        - Uses the GNews API to fetch news articles
        - The API key is read from the `GNEWS_API_KEY` environment variable
        - See https://gnews.io/docs/ for the full API documentation
    """

    GNEWS_API_KEY = os.getenv("GNEWS_API_KEY")

    @classmethod
    def get_news(
            cls,
            query: str = None,
            lang: str = "ja",  # e.g. "en"
            country: str = "jp",  # e.g. "us"
            max: int = 10,
            from_at: str = None,  # ISO 8601 datetime string (e.g. "2023-10-01T00:00:00Z")
            to_at: str = None,
    ):
        """
        Fetch news articles from GNews.
        Args:
            query (str): Search query
            lang (str): Article language code (e.g. "ja" = Japanese)
            country (str): Country code (e.g. "jp" = Japan)
            max (int): Maximum number of articles (up to 100)
            from_at (str): Start datetime (ISO 8601)
            to_at (str): End datetime (ISO 8601)
        """
        url = "https://gnews.io/api/v4/search"
        params = {
            "apikey": cls.GNEWS_API_KEY,
            "q": query,
            "lang": lang,
            "country": country,
            "max": max,
            "from": from_at,
            "to": to_at,
        }
        # Drop None values so they are not sent
        params = {k: v for k, v in params.items() if v is not None}
        response = requests.get(url, params=params)
        response.raise_for_status()
        json_data: dict = response.json()
        logger.debug(f"GNews API Response: {json_data}")
        return json_data.get("articles", [])

    @classmethod
    def get_top_headlines(
            cls,
            category: str = None,  # general, world, nation, business, technology, entertainment, sports, science, health
            lang: str = "ja",  # e.g. "en"
            country: str = "jp",  # e.g. "us"
            max: int = 10,
            from_at: str = None,  # ISO 8601 datetime string (e.g. "2023-10-01T00:00:00Z")
            to_at: str = None,
            query: str = None,
    ):
        """Fetch top headlines from GNews.
        Args:
            category (str): Category (general, world, nation, business, technology, entertainment, sports, science, health)
            lang (str): Article language code (e.g. "ja" = Japanese)
            country (str): Country code (e.g. "jp" = Japan)
            max (int): Maximum number of articles (up to 100)
            from_at (str): Start datetime (ISO 8601)
            to_at (str): End datetime (ISO 8601)
            query (str): Search query
        """
        url = "https://gnews.io/api/v4/top-headlines"
        params = {
            "apikey": cls.GNEWS_API_KEY,
            "category": category,
            "lang": lang,
            "country": country,
            "max": max,
            "from": from_at,
            "to": to_at,
            "q": query,
        }
        # Drop None values so they are not sent
        params = {k: v for k, v in params.items() if v is not None}
        response = requests.get(url, params=params)
        response.raise_for_status()
        json_data: dict = response.json()
        logger.debug(f"GNews API Response: {json_data}")
        return json_data.get("articles", [])
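An editor-added, hedged usage sketch; it needs a real GNEWS_API_KEY in the environment before import and should be run from src/ so the imports resolve:

```py
import os
from providers.api_g_news import ApiGNews

assert os.getenv("GNEWS_API_KEY"), "set GNEWS_API_KEY first"

# same call the api_etl_flow.call_api task makes
articles = ApiGNews.get_top_headlines(category="nation", lang="ja", country="jp", max=5)
for a in articles:
    print(a["publishedAt"], a["title"])
```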
35 src/providers/duck_db_provider.py Normal file
@@ -0,0 +1,35 @@
import duckdb

class DuckDBProvider:
    def __init__(self, db_path: str = ":memory:", read_only: bool = False):
        self.con = self.connect(db_path, read_only)

    def connect(self, db_path: str = ":memory:", read_only: bool = False):
        return duckdb.connect(database=db_path, read_only=read_only)

    def close(self):
        """Close the connection."""
        if self.con:
            self.con.close()

    def query_df(self, sql: str):
        """Run a SQL query and return the result as a DataFrame."""
        return self.con.execute(sql).df()

    def max_value(
            self,
            file_path: str,
            column: str,
            hive_partitioning: bool = True,
            union_by_name: bool = True,
    ) -> any:
        """Get the maximum value of the given column across CSV files."""
        query = f"""
            SELECT MAX({column}) AS max_{column}
            FROM read_csv_auto('{file_path}',
                hive_partitioning={1 if hive_partitioning else 0},
                union_by_name={1 if union_by_name else 0}
            )
        """
        result = self.con.execute(query).fetchone()[0]
        return result
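An editor-added sketch; read_csv_auto accepts glob patterns, so this scans the hive-partitioned layout that CSVAnalyzer/CSVWriter write (the path is illustrative):

```py
from providers.duck_db_provider import DuckDBProvider

db = DuckDBProvider()  # in-memory database
latest = db.max_value("data/news/bronze/y=*/m=*/*.csv", column="published_at")
print(latest)  # e.g. the newest published_at seen across all partitions
db.close()
```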
191 src/providers/google_cloud_storage_provider.py Normal file
@@ -0,0 +1,191 @@
import os
import io
import json
from typing import Optional, List, Dict, Any, Union, BinaryIO
from datetime import timedelta
import mimetypes
import csv

from google.cloud import storage
from google.oauth2 import service_account

from lib.custom_logger import get_logger
logger = get_logger()

import zipfile
from pathlib import Path

class GoogleCloudStorageProvider:

    def __init__(self, cred_path: Optional[str] = None, project: Optional[str] = None):
        try:
            if cred_path:
                creds = service_account.Credentials.from_service_account_file(cred_path)
                # If no project is given, take it from the credentials
                effective_project = project or creds.project_id
                self._client = storage.Client(
                    project=effective_project, credentials=creds
                )
                logger.info(f"GCS client initialized with service account file. project={effective_project}")
            elif os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"):
                cred_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
                # from_service_account_info expects a dict, so parse the JSON string first
                creds = service_account.Credentials.from_service_account_info(json.loads(cred_json))
                effective_project = project or creds.project_id
                self._client = storage.Client(
                    project=effective_project, credentials=creds
                )
                logger.info("GCS client initialized with credentials from environment variable.")
            else:
                self._client = storage.Client(project=project)
                logger.info("GCS client initialized with default credentials (ADC).")
        except Exception as e:
            logger.error(f"GCS initialization failed: {e}")
            raise

    # Private methods to get bucket and blob references
    def _bucket(self, bucket: str) -> storage.Bucket:
        return self._client.bucket(bucket)

    def _blob(self, bucket: str, object_name: str) -> storage.Blob:
        return self._bucket(bucket).blob(object_name)


    # Bucket operations
    def get_buckets(self) -> List[str]:
        buckets: List[storage.Bucket] = self._client.list_buckets()
        return [b.name for b in buckets]

    def create_bucket(self, bucket_name: str, location: str = "ASIA-NORTHEAST1", storage_class: str = "STANDARD"):
        b = storage.Bucket(self._client, name=bucket_name)
        b.storage_class = storage_class
        return self._client.create_bucket(b, location=location)

    def is_exists_bucket(self, bucket_name: str) -> bool:
        try:
            self._client.get_bucket(bucket_name)
            return True
        except Exception:
            return False

    # Object operations
    def get_items(self, bucket: str, prefix: str | None = None, match_glob: str | None = None) -> List[Dict[str, Any]]:
        items: List[storage.Blob] = self._client.list_blobs(bucket, prefix=prefix, match_glob=match_glob)
        return [{"name": bl.name, "size": bl.size, "updated": bl.updated, "content_type": bl.content_type}
                for bl in items]

    def is_exists_item(self, bucket: str, object_name: str) -> bool:
        return self._blob(bucket, object_name).exists()


    def write_item(self, bucket: str, object_name: str, data: Union[bytes, BinaryIO, str],
                   content_type: str | None = None) -> Dict[str, Any]:
        """
        Write an object.

        Args:
            bucket (str): Bucket name
            object_name (str): Object name
            data (Union[bytes, BinaryIO, str]): Data to write
            content_type (Optional[str]): Content type (MIME type)
        Returns:
            Dict[str, Any]: Info about the written object
        """
        blob = self._blob(bucket, object_name)
        if content_type is None:
            content_type = mimetypes.guess_type(object_name)[0] or "application/octet-stream"
        blob.content_type = content_type

        if isinstance(data, (bytes, bytearray)):
            blob.upload_from_file(io.BytesIO(data), content_type=content_type, rewind=True)
        elif hasattr(data, "read"):
            blob.upload_from_file(data, content_type=content_type, rewind=True)
        elif isinstance(data, str) and os.path.exists(data):
            blob.upload_from_filename(data, content_type=content_type)
        else:
            raise ValueError("data must be bytes, file-like, or existing filepath")
        return {"name": blob.name, "size": blob.size, "content_type": blob.content_type}

    def read_item(self, bucket: str, object_name: str, as_text: bool = False, encoding: str = "utf-8"):
        data = self._blob(bucket, object_name).download_as_bytes()
        return data.decode(encoding) if as_text else data

    def delete_item(self, bucket: str, object_name: str):
        """Delete an object."""
        self._blob(bucket, object_name).delete()

    def generate_signed_url(self, bucket: str, object_name: str, method: str = "GET",
                            expires: timedelta = timedelta(hours=1)) -> str:
        return self._blob(bucket, object_name).generate_signed_url(expiration=expires, method=method)

    def zip_items(
            self,
            bucket: str,
            object_names: List[str],
    ) -> bytes:
        """
        Bundle multiple GCS objects into one ZIP and return the ZIP binary (bytes).

        Args:
            bucket (str): Bucket name
            object_names (List[str]): Target object names
        Returns:
            bytes: ZIP file binary
        """
        out = io.BytesIO()
        with zipfile.ZipFile(out, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
            for obj in object_names:
                blob = self._blob(bucket, obj)
                if not blob.exists():
                    raise FileNotFoundError(f"Object not found: gs://{bucket}/{obj}")

                buf = io.BytesIO()
                blob.download_to_file(buf)
                buf.seek(0)
                arcname = Path(obj).name
                zf.writestr(arcname, buf.read())

            zf.comment = f"bucket={bucket}, files={len(object_names)}".encode()

        return out.getvalue()

    def upload_folder(self, bucket: str, folder_path: str, gcs_prefix: str = ""):
        """
        Recursively upload a local folder to GCS.

        Args:
            bucket (str): Bucket name
            folder_path (str): Local folder path
            gcs_prefix (str): Prefix (folder path) on GCS
        """
        _bucket = self._bucket(bucket)

        for root, _, files in os.walk(folder_path):
            for file in files:
                local_file_path = os.path.join(root, file)
                # Build a relative path so the folder structure is preserved
                relative_path = os.path.relpath(local_file_path, folder_path)
                gcs_object_name = os.path.join(gcs_prefix, relative_path).replace("\\", "/")

                blob = _bucket.blob(gcs_object_name)
                blob.upload_from_filename(local_file_path)
                logger.info(f"Uploaded {local_file_path} to gs://{bucket}/{gcs_object_name}")

    def write_csv_item(
            self,
            bucket: str,
            object_name: str,
            records: List):

        """Upload CSV data to GCS.

        Args:
            bucket (str): Bucket name
            object_name (str): Object name
            records (List): List of CSV rows
        """
        blob = self._blob(bucket, object_name)
        with blob.open("w", content_type="text/csv", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(records)
        logger.info(f"Uploaded CSV to gs://{bucket}/{object_name}")
        return {"name": blob.name, "size": blob.size, "content_type": blob.content_type}
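An editor-added, hedged usage sketch; it needs a reachable bucket and credentials (ADC or the env vars handled above), and the object path mirrors the one api_etl_flow.py builds:

```py
import os
from providers.google_cloud_storage_provider import GoogleCloudStorageProvider

provider = GoogleCloudStorageProvider()  # falls back to ADC if no key is given
bucket = os.getenv("GCS_BUCKET_NAME")

provider.write_csv_item(
    bucket=bucket,
    object_name="data_science/data/y=2023/news/news_2023-10-01_part-001.csv",
    records=[["title", "url"], ["hello", "https://example.com/1"]],
)
print(provider.get_items(bucket, prefix="data_science/data/")[:3])
```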
6 src/utils/types.py Normal file
@@ -0,0 +1,6 @@
from enum import Enum

class DataLayer(str, Enum):
    BRONZE = "bronze"
    SILVER = "silver"
    GOLD = "gold"