73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
from typing import Any

import duckdb
|
|
|
|
|
|
class DuckDBProvider:
    """Convenience wrapper around a single DuckDB connection.

    The connection is opened in ``__init__`` and stored on ``self.con``.
    Instances can be used as context managers; leaving the ``with`` block
    closes the connection.
    """

    def __init__(self, db_path: str = ":memory:", read_only: bool = False):
        """Open a connection to ``db_path`` and keep it on ``self.con``."""
        self.con = self.connect(db_path, read_only)

    def __enter__(self) -> "DuckDBProvider":
        """Return the provider itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection; exceptions from the block are not suppressed."""
        self.close()

    def connect(self, db_path: str = ":memory:", read_only: bool = False):
        """Create and return a new DuckDB connection (``self.con`` is not touched)."""
        return duckdb.connect(database=db_path, read_only=read_only)

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate the
        literal early.
        """
        escaped = str(value).replace("'", "''")
        return f"'{escaped}'"

    @staticmethod
    def _read_csv_expr(
        path: str,
        hive_partitioning: bool,
        union_by_name: bool,
    ) -> str:
        """Build the ``read_csv_auto(...)`` table expression shared by the readers."""
        # The trailing newline is kept so the fragment ends the same way the
        # hand-written multi-line templates did when spliced into larger SQL.
        return (
            f"read_csv_auto({DuckDBProvider._quote_literal(path)},\n"
            f"    hive_partitioning={1 if hive_partitioning else 0},\n"
            f"    union_by_name={1 if union_by_name else 0}\n"
            f")\n"
        )

    def _require_connection(self):
        """Return the open connection or raise ``ValueError`` if there is none."""
        # getattr: ``con`` may be missing entirely if ``__init__`` raised.
        con = getattr(self, "con", None)
        if con is None:
            raise ValueError("DuckDB is not connected.")
        return con

    def setup_gcs(self, access_key: str, secret_key: str):
        """Create or replace the ``gcs_creds`` secret used for GCS access.

        Args:
            access_key: Value stored as the secret's ``KEY_ID``.
            secret_key: Value stored as the secret's ``SECRET``.

        Raises:
            ValueError: If the provider has no open connection.
        """
        con = self._require_connection()
        con.sql(
            f"""
            CREATE OR REPLACE SECRET gcs_creds (
                TYPE gcs,
                KEY_ID {self._quote_literal(access_key)},
                SECRET {self._quote_literal(secret_key)}
            );
            """
        )

    def close(self):
        """Close the connection; calling it again is a no-op."""
        con = getattr(self, "con", None)
        if con is not None:
            con.close()
            # Drop the handle so later calls fail with the explicit
            # "not connected" error rather than using a closed connection.
            self.con = None

    def query_df(self, sql: str):
        """Execute ``sql`` and return the result as a DataFrame.

        Raises:
            ValueError: If the provider has no open connection.
        """
        return self._require_connection().execute(sql).df()

    def max_value(
        self,
        file_path: str,
        column: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> Any:
        """Return the maximum value of ``column`` in the CSV file(s) at ``file_path``.

        Args:
            file_path: Path passed to ``read_csv_auto``.
            column: Column name; interpolated into the SQL verbatim, so it
                must be a trusted plain identifier.
            hive_partitioning: Forwarded to ``read_csv_auto`` as 1/0.
            union_by_name: Forwarded to ``read_csv_auto`` as 1/0.

        Raises:
            ValueError: If the provider has no open connection.
        """
        con = self._require_connection()
        source = self._read_csv_expr(file_path, hive_partitioning, union_by_name)
        query = f"SELECT MAX({column}) AS max_{column}\nFROM {source}"
        return con.execute(query).fetchone()[0]

    def load_file(self, file_glob: str, table: str):
        """Load the CSV file(s) matching ``file_glob`` into ``table``.

        An existing table of the same name is replaced. ``table`` is
        interpolated into the SQL verbatim, so it must be a trusted
        identifier.

        Raises:
            ValueError: If the provider has no open connection.
        """
        con = self._require_connection()
        source = self._quote_literal(file_glob)
        con.execute(
            f"CREATE OR REPLACE TABLE {table} AS\n"
            f"SELECT *\n"
            f"FROM read_csv_auto({source}, HEADER=TRUE, IGNORE_ERRORS=TRUE)\n"
        )

    @staticmethod
    def get_gs_csv_name(
        backet: str,
        object_name: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> str:
        """Return a ``read_csv_auto(...)`` SQL fragment for a ``gs://`` object.

        Args:
            backet: Bucket name (the parameter name is kept as-is for
                backward compatibility with keyword callers).
            object_name: Object path or glob inside the bucket.
            hive_partitioning: Rendered into the fragment as 1/0.
            union_by_name: Rendered into the fragment as 1/0.
        """
        return DuckDBProvider._read_csv_expr(
            f"gs://{backet}/{object_name}",
            hive_partitioning,
            union_by_name,
        )