python-common-code/src/providers/duck_db_provider.py
2025-10-26 17:10:27 +09:00

73 lines
2.3 KiB
Python

from typing import Any

import duckdb
class DuckDBProvider:
    """Small convenience wrapper around a single DuckDB connection.

    The connection is opened in ``__init__`` and stored on ``self.con``.
    Call ``close()`` when done, or use the instance as a context manager::

        with DuckDBProvider() as provider:
            df = provider.query_df("SELECT 42 AS answer")

    String *values* (paths, globs, credentials) are embedded as escaped SQL
    string literals. Identifiers (``column``, ``table``) and raw SQL passed
    to ``query_df`` are interpolated verbatim, so they must come from
    trusted code, never from end-user input.
    """

    def __init__(self, db_path: str = ":memory:", read_only: bool = False):
        """Open a DuckDB connection and keep it on ``self.con``.

        Args:
            db_path: Database file path, or ``":memory:"`` for an in-memory DB.
            read_only: Forwarded to ``duckdb.connect`` as ``read_only``.
        """
        self.con = self.connect(db_path, read_only)

    def __enter__(self):
        """Return the provider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block."""
        self.close()

    def connect(
        self, db_path: str = ":memory:", read_only: bool = False
    ) -> "duckdb.DuckDBPyConnection":
        """Create and return a new DuckDB connection.

        The returned connection is NOT stored on the instance; ``__init__``
        does that with the value returned here.
        """
        return duckdb.connect(database=db_path, read_only=read_only)

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate the
        literal early (prevents broken statements and SQL injection).
        """
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _csv_source(path: str, hive_partitioning: bool, union_by_name: bool) -> str:
        """Build the ``read_csv_auto(...)`` table expression for *path*."""
        return (
            f"read_csv_auto({DuckDBProvider._sql_literal(path)},\n"
            f"hive_partitioning={1 if hive_partitioning else 0},\n"
            f"union_by_name={1 if union_by_name else 0}\n"
            ")\n"
        )

    def _require_connection(self):
        """Return the open connection.

        Raises:
            ValueError: If there is no connection (e.g. after ``close()``).
        """
        con = getattr(self, "con", None)
        if con is None:
            raise ValueError("DuckDB is not connected.")
        return con

    def setup_gcs(self, access_key: str, secret_key: str):
        """Create or replace the ``gcs_creds`` secret used for GCS access.

        Args:
            access_key: Value stored as the secret's ``KEY_ID``.
            secret_key: Value stored as the secret's ``SECRET``.

        Raises:
            ValueError: If the connection is not open.
        """
        con = self._require_connection()
        # Credentials are escaped: a key containing "'" must not be able to
        # break out of the literal.
        con.sql(
            "CREATE OR REPLACE SECRET gcs_creds (\n"
            "TYPE gcs,\n"
            f"KEY_ID {self._sql_literal(access_key)},\n"
            f"SECRET {self._sql_literal(secret_key)}\n"
            ");"
        )

    def close(self):
        """Close the connection if it is open; safe to call more than once."""
        con = getattr(self, "con", None)
        if con is not None:
            con.close()
            # Drop the handle so later calls fail fast with a clear error and
            # a second close() is a no-op.
            self.con = None

    def query_df(self, sql: str):
        """Execute *sql* and return the result via the connection's ``.df()``.

        Raises:
            ValueError: If the connection is not open.
        """
        return self._require_connection().execute(sql).df()

    def max_value(
        self,
        file_path: str,
        column: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> Any:
        """Return the maximum value of *column* across the CSV file(s).

        Args:
            file_path: Path or glob handed to ``read_csv_auto``.
            column: Column name; interpolated verbatim, so it must be a
                trusted plain identifier.
            hive_partitioning: Passed to ``read_csv_auto`` as 1/0.
            union_by_name: Passed to ``read_csv_auto`` as 1/0.

        Returns:
            The first column of the single result row, or ``None`` if no row
            is returned.

        Raises:
            ValueError: If the connection is not open.
        """
        con = self._require_connection()
        source = self._csv_source(file_path, hive_partitioning, union_by_name)
        query = f"SELECT MAX({column}) AS max_{column}\nFROM {source}"
        row = con.execute(query).fetchone()
        return row[0] if row else None

    def load_file(self, file_glob: str, table: str):
        """Load CSV file(s) into *table*, replacing it if it already exists.

        Args:
            file_glob: Path or glob handed to ``read_csv_auto``.
            table: Target table name; interpolated verbatim, so it must be a
                trusted identifier.

        Raises:
            ValueError: If the connection is not open.
        """
        con = self._require_connection()
        con.execute(
            f"CREATE OR REPLACE TABLE {table} AS\n"
            "SELECT *\n"
            f"FROM read_csv_auto({self._sql_literal(file_glob)}, "
            "HEADER=TRUE, IGNORE_ERRORS=TRUE)\n"
        )

    @staticmethod
    def get_gs_csv_name(
        backet: str,
        object_name: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> str:
        """Return a ``read_csv_auto(...)`` expression for a ``gs://`` object.

        The result is a SQL fragment meant to be placed after ``FROM`` in a
        query string.

        Args:
            backet: Bucket name (parameter spelling kept for compatibility
                with existing keyword callers).
            object_name: Object path or glob inside the bucket.
            hive_partitioning: Passed to ``read_csv_auto`` as 1/0.
            union_by_name: Passed to ``read_csv_auto`` as 1/0.
        """
        return DuckDBProvider._csv_source(
            f"gs://{backet}/{object_name}", hive_partitioning, union_by_name
        )