from typing import Any

import duckdb


class DuckDBProvider:
    """Convenience wrapper around a single DuckDB connection.

    The connection is opened in ``__init__`` and stored on ``self.con``.
    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.

    Values that end up inside SQL string literals (credentials, file paths)
    have embedded single quotes doubled before interpolation. SQL passed to
    :meth:`query_df` and the ``table`` argument of :meth:`load_file` are
    used verbatim and must come from trusted code.
    """

    def __init__(self, db_path: str = ":memory:", read_only: bool = False):
        """Open a connection to ``db_path`` and keep it on ``self.con``."""
        self.con = self.connect(db_path, read_only)

    def __enter__(self) -> "DuckDBProvider":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def connect(self, db_path: str = ":memory:", read_only: bool = False):
        """Return a new DuckDB connection; does not touch ``self.con``."""
        return duckdb.connect(database=db_path, read_only=read_only)

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Render ``value`` as a SQL string literal, doubling single quotes."""
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Render ``name`` as a double-quoted SQL identifier."""
        return '"' + str(name).replace('"', '""') + '"'

    @staticmethod
    def _read_csv_source(
        path: str,
        hive_partitioning: bool,
        union_by_name: bool,
    ) -> str:
        """Build the ``read_csv_auto(...)`` table expression for ``path``."""
        path_literal = DuckDBProvider._quote_literal(path)
        hive_flag = 1 if hive_partitioning else 0
        union_flag = 1 if union_by_name else 0
        # Spacing (including the trailing blank) mirrors the fragment this
        # class has always handed back from get_gs_csv_name.
        return (
            f"read_csv_auto({path_literal}, "
            f"hive_partitioning={hive_flag}, "
            f"union_by_name={union_flag} ) "
        )

    def _require_connection(self):
        """Return the open connection or raise ``ValueError`` if there is none."""
        con = getattr(self, "con", None)
        if con is None:
            raise ValueError("DuckDB is not connected.")
        return con

    def setup_gcs(self, access_key: str, secret_key: str):
        """Create or replace the ``gcs_creds`` secret used for GCS access.

        Args:
            access_key: Value stored as the secret's ``KEY_ID``.
            secret_key: Value stored as the secret's ``SECRET``.

        Raises:
            ValueError: If there is no open connection.
        """
        con = self._require_connection()
        key_id = self._quote_literal(access_key)
        secret = self._quote_literal(secret_key)
        con.execute(f"""
            CREATE OR REPLACE SECRET gcs_creds (
                TYPE gcs,
                KEY_ID {key_id},
                SECRET {secret}
            );
        """)

    def close(self):
        """Close the connection, if any, and reset ``self.con`` to ``None``.

        Calling it again is a no-op. Resetting the attribute is what lets the
        other methods detect a closed provider and raise ``ValueError``.
        """
        con = getattr(self, "con", None)
        if con is None:
            return
        try:
            con.close()
        finally:
            self.con = None

    def query_df(self, sql: str, params: Any = None):
        """Execute ``sql`` and return the result via the cursor's ``.df()``.

        Args:
            sql: Statement to run; executed verbatim.
            params: Optional values for placeholders in ``sql``. When given,
                they are passed to ``execute`` as its second argument so the
                caller does not have to format values into the SQL text.

        Raises:
            ValueError: If there is no open connection.
        """
        con = self._require_connection()
        cursor = con.execute(sql) if params is None else con.execute(sql, params)
        return cursor.df()

    def max_value(
        self,
        file_path: str,
        column: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> Any:
        """Return ``MAX(column)`` over the CSV file(s) at ``file_path``.

        Args:
            file_path: Path or glob handed to ``read_csv_auto``.
            column: Name of the column to aggregate. It is emitted as a
                quoted identifier, so it must be a column name, not an
                expression.
            hive_partitioning: Forwarded to ``read_csv_auto`` as 1/0.
            union_by_name: Forwarded to ``read_csv_auto`` as 1/0.

        Returns:
            The first column of the single result row, or ``None`` when no
            row is returned.

        Raises:
            ValueError: If there is no open connection.
        """
        con = self._require_connection()
        source = self._read_csv_source(file_path, hive_partitioning, union_by_name)
        column_ident = self._quote_identifier(column)
        alias_ident = self._quote_identifier(f"max_{column}")
        query = f"""
            SELECT MAX({column_ident}) AS {alias_ident}
            FROM {source}
        """
        row = con.execute(query).fetchone()
        return row[0] if row is not None else None

    def load_file(self, file_glob: str, table: str):
        """Create or replace ``table`` from the CSV file(s) at ``file_glob``.

        Args:
            file_glob: Path or glob handed to ``read_csv_auto``.
            table: Target table name. It is interpolated verbatim (so a
                schema-qualified or pre-quoted name keeps working) and must
                therefore never come from untrusted input.

        Raises:
            ValueError: If there is no open connection.
        """
        con = self._require_connection()
        glob_literal = self._quote_literal(file_glob)
        sql = f"""
            CREATE OR REPLACE TABLE {table} AS
            SELECT * FROM read_csv_auto({glob_literal}, HEADER=TRUE, IGNORE_ERRORS=TRUE)
        """
        con.execute(sql)

    @staticmethod
    def get_gs_csv_name(
        backet: str,
        object_name: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> str:
        """Return a ``read_csv_auto(...)`` SQL fragment for a ``gs://`` object.

        Args:
            backet: Bucket name (parameter spelling kept for keyword callers).
            object_name: Object path or glob inside the bucket.
            hive_partitioning: Rendered into the fragment as 1/0.
            union_by_name: Rendered into the fragment as 1/0.

        Returns:
            SQL text usable wherever a table expression is expected, e.g.
            after ``FROM``.
        """
        return DuckDBProvider._read_csv_source(
            f"gs://{backet}/{object_name}",
            hive_partitioning,
            union_by_name,
        )