# python-common-code/src/providers/duck_db_provider.py

from typing import Any

import duckdb


class DuckDBProvider:
    """Thin convenience wrapper around a single DuckDB connection.

    The connection is opened in ``__init__`` and stored on ``self.con``.
    Instances can also be used as context managers, in which case the
    connection is closed when the ``with`` block exits.
    """

    def __init__(self, db_path: str = ":memory:", read_only: bool = False) -> None:
        """Open a connection to ``db_path`` and keep it on ``self.con``.

        Args:
            db_path: Database location passed to ``duckdb.connect``.
            read_only: Passed through to ``duckdb.connect``.
        """
        self.con = self.connect(db_path, read_only)

    def __enter__(self) -> "DuckDBProvider":
        """Return the provider itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the connection; exceptions from the block are not suppressed."""
        self.close()

    @staticmethod
    def _escape_literal(value) -> str:
        """Return ``value`` as text safe to place inside a '...' SQL literal."""
        # SQL escapes a single quote inside a string literal by doubling it.
        return str(value).replace("'", "''")

    @staticmethod
    def _quote_identifier(name) -> str:
        """Return ``name`` wrapped in double quotes as a SQL identifier."""
        # An embedded double quote is escaped by doubling it.
        return '"' + str(name).replace('"', '""') + '"'

    def connect(
        self, db_path: str = ":memory:", read_only: bool = False
    ) -> "duckdb.DuckDBPyConnection":
        """Create and return a new DuckDB connection.

        The connection is returned only; ``self.con`` is not modified here.
        """
        return duckdb.connect(database=db_path, read_only=read_only)

    def setup_gcs(self, access_key: str, secret_key: str):
        """Create (or replace) the ``gcs_creds`` secret used for GCS access.

        Args:
            access_key: Value stored as the secret's ``KEY_ID``.
            secret_key: Value stored as the secret's ``SECRET``.

        Raises:
            ValueError: If ``self.con`` is falsy (no connection available).
        """
        if not self.con:
            raise ValueError("DuckDB is not connected.")
        # The credentials are embedded as string literals, so any single
        # quote they contain must be escaped to keep the statement intact.
        key_id = self._escape_literal(access_key)
        secret = self._escape_literal(secret_key)
        self.con.sql(f"""
            CREATE OR REPLACE SECRET gcs_creds (
                TYPE gcs,
                KEY_ID '{key_id}',
                SECRET '{secret}'
            );
        """)

    def close(self):
        """Close the connection if one is set."""
        if self.con:
            self.con.close()

    def query_df(self, sql: str):
        """Execute ``sql`` and return the result of the cursor's ``.df()``.

        ``sql`` is executed verbatim; callers are responsible for its content.
        """
        return self.con.execute(sql).df()

    def max_value(
        self,
        file_path: str,
        column: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> Any:
        """Return the maximum value of ``column`` across the given CSV file(s).

        Args:
            file_path: Path or glob handed to ``read_csv_auto``.
            column: Name of the column to aggregate. It is quoted as an
                identifier, so names with spaces or reserved words work.
            hive_partitioning: Forwarded to ``read_csv_auto`` as 1/0.
            union_by_name: Forwarded to ``read_csv_auto`` as 1/0.

        Returns:
            The first column of the single result row, i.e. ``MAX(column)``
            as converted by DuckDB.
        """
        target = self._quote_identifier(column)
        alias = self._quote_identifier(f"max_{column}")
        source = self._escape_literal(file_path)
        query = f"""
            SELECT MAX({target}) AS {alias}
            FROM read_csv_auto('{source}',
            hive_partitioning={1 if hive_partitioning else 0},
            union_by_name={1 if union_by_name else 0}
            )
        """
        return self.con.execute(query).fetchone()[0]

    def load_file(self, file_glob: str, table: str):
        """Load CSV file(s) into ``table``, replacing it if it already exists.

        Args:
            file_glob: Path or glob handed to ``read_csv_auto``; read with
                ``HEADER=TRUE`` and ``IGNORE_ERRORS=TRUE``.
            table: Target table name. NOTE: it is interpolated verbatim so
                schema-qualified or pre-quoted names keep working; it must
                therefore come from trusted code, not from user input.
        """
        source = self._escape_literal(file_glob)
        sql = f"""
        CREATE OR REPLACE TABLE {table} AS
        SELECT *
        FROM read_csv_auto('{source}', HEADER=TRUE, IGNORE_ERRORS=TRUE)
        """
        self.con.execute(sql)

    @staticmethod
    def get_gs_csv_name(
        backet: str,
        object_name: str,
        hive_partitioning: bool = True,
        union_by_name: bool = True,
    ) -> str:
        """Build a ``read_csv_auto(...)`` SQL fragment for a ``gs://`` object.

        Args:
            backet: Bucket name (parameter spelling kept for keyword callers).
            object_name: Object path or glob inside the bucket.
            hive_partitioning: Emitted as ``hive_partitioning=1`` or ``0``.
            union_by_name: Emitted as ``union_by_name=1`` or ``0``.

        Returns:
            The SQL fragment as text, intended to be embedded in a query.
        """
        bucket_part = DuckDBProvider._escape_literal(backet)
        object_part = DuckDBProvider._escape_literal(object_name)
        return f"""read_csv_auto('gs://{bucket_part}/{object_part}',
        hive_partitioning={1 if hive_partitioning else 0},
        union_by_name={1 if union_by_name else 0}
        )
        """