#!/usr/bin/env python3
|
||
"""
|
||
merge_db.py – Merge multiple SQLite databases into one output file.
|
||
|
||
Rules
|
||
-----
|
||
* Tables `category` and `positionen` are deduplicated by ALL columns
|
||
(content-based identity, ignoring the local rowid/id column).
|
||
* Table `jobs` rows from later files are appended with a re-sequenced
|
||
`jobid` so IDs never clash across files.
|
||
* Rows are inserted in file order: the first file wins on deduplication
|
||
for reference tables; job rows from later files come after earlier ones.
|
||
|
||
Usage
|
||
-----
|
||
python merge_db.py -o merged.db 02.db 03.db [04.db ...]
|
||
"""
|
||
|
||
import argparse
|
||
import shutil
|
||
import sqlite3
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def get_table_names(con: sqlite3.Connection) -> list[str]:
    """Return the names of all user tables in *con*, sorted alphabetically."""
    rows = con.execute(
        "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
    ).fetchall()
    return [name for (name,) in rows]
|
||
|
||
|
||
def get_columns(con: sqlite3.Connection, table: str) -> list[str]:
    """Return the column names of *table* in declaration order.

    Yields an empty list when the table does not exist (PRAGMA returns
    no rows in that case).
    """
    info = con.execute(f'PRAGMA table_info("{table}")').fetchall()
    # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk);
    # index 1 is the column name.
    return [entry[1] for entry in info]
|
||
|
||
|
||
def get_create_statement(con: sqlite3.Connection, table: str) -> str:
|
||
cur = con.execute(
|
||
"SELECT sql FROM sqlite_master WHERE type='table' AND name=?", (table,)
|
||
)
|
||
row = cur.fetchone()
|
||
return row[0] if row else None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Merge logic per table type
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def merge_dedup_table(
    src: sqlite3.Connection,
    dst: sqlite3.Connection,
    table: str,
    id_col: str,
) -> tuple[int, int]:
    """Merge a reference table using content-based deduplication.

    Rows are compared on every column *except* `id_col`.  A row whose
    content already exists in `dst` is skipped; any other row is inserted
    and receives a fresh auto-incremented id.

    Returns (rows_read, rows_inserted).
    """
    content_cols = [c for c in get_columns(src, table) if c != id_col]
    if not content_cols:
        # Table consists solely of the id column — nothing to merge on.
        return 0, 0

    col_list = ", ".join(f'"{c}"' for c in content_cols)
    qmarks = ", ".join("?" * len(content_cols))
    select_sql = f'SELECT {col_list} FROM "{table}"'
    insert_sql = f'INSERT INTO "{table}" ({col_list}) VALUES ({qmarks})'

    # Fingerprints of content already present in the destination.
    seen = {tuple(r) for r in dst.execute(select_sql)}

    rows = src.execute(select_sql).fetchall()
    added = 0
    for r in rows:
        fingerprint = tuple(r)
        if fingerprint in seen:
            continue  # first file wins — identical content already merged
        dst.execute(insert_sql, r)
        seen.add(fingerprint)
        added += 1

    return len(rows), added
|
||
|
||
|
||
def merge_jobs_table(
    src: sqlite3.Connection,
    dst: sqlite3.Connection,
    table: str = "jobs",
    id_col: str = "jobid",
) -> tuple[int, int]:
    """Append job rows from `src` into `dst`, re-sequencing `id_col`.

    Each inserted row's id is its source id shifted by the current maximum
    id in `dst`, so ids from different files never clash.  Rows whose
    content (all columns except `id_col`) already exists in `dst` are
    skipped.

    Returns (rows_read, rows_inserted).
    """
    cols = get_columns(src, table)
    content_cols = [c for c in cols if c != id_col]

    quoted_all = ", ".join(f'"{c}"' for c in cols)
    quoted_content = ", ".join(f'"{c}"' for c in content_cols)
    qmarks = ", ".join("?" * len(cols))

    # New ids continue after the destination's current maximum
    # (MAX returns NULL → 0 when the table is empty).
    (max_id,) = dst.execute(f'SELECT MAX("{id_col}") FROM "{table}"').fetchone()
    offset = max_id or 0

    # Content fingerprints already present in the destination.
    seen = {
        tuple(r) for r in dst.execute(f'SELECT {quoted_content} FROM "{table}"')
    }

    rows = src.execute(f'SELECT {quoted_all} FROM "{table}"').fetchall()
    index_of = {c: i for i, c in enumerate(cols)}
    id_pos = index_of[id_col]
    insert_sql = f'INSERT INTO "{table}" ({quoted_all}) VALUES ({qmarks})'

    added = 0
    for r in rows:
        fingerprint = tuple(r[index_of[c]] for c in content_cols)
        if fingerprint in seen:
            continue  # skip duplicate content
        shifted = list(r)
        shifted[id_pos] = r[id_pos] + offset
        dst.execute(insert_sql, shifted)
        seen.add(fingerprint)
        added += 1

    return len(rows), added
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Schema helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def ensure_table(dst: sqlite3.Connection, src: sqlite3.Connection, table: str) -> None:
    """Create `table` in `dst` using src's DDL, unless it already exists."""
    already_there = dst.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (table,)
    ).fetchone()
    if already_there:
        return
    ddl = get_create_statement(src, table)
    if ddl:
        dst.execute(ddl)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main merge routine
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Map table name → (id_column, merge_strategy)
|
||
# strategy: "dedup" → content-based dedup, new ids assigned
|
||
# "jobs" → append with id re-sequencing + content dedup
|
||
# Known tables and how to merge them; unknown tables get fallback handling
# in merge_databases().
TABLE_CONFIG: dict[str, tuple[str, str]] = {
    "category": ("catid", "dedup"),  # reference table: content-based dedup
    "positionen": ("posid", "dedup"),  # reference table: content-based dedup
    "jobs": ("jobid", "jobs"),  # job rows: append with id re-sequencing
}
|
||
|
||
|
||
def merge_databases(input_files: list[Path], output_file: Path) -> None:
    """Merge *input_files* (in order) into *output_file*.

    The first input is copied verbatim as the base; every later file is
    merged table-by-table according to TABLE_CONFIG.  Tables not listed
    there fall back to content-based dedup treating their first column
    as the id column.

    Fix over the original: each source connection is now closed in a
    ``finally`` block, so an exception mid-merge no longer leaks the
    open source handle.
    """
    # Bootstrap: copy first file as the base for the output
    print(f"[init] Bootstrapping output from '{input_files[0].name}' …")
    shutil.copy2(input_files[0], output_file)

    dst = sqlite3.connect(output_file)
    dst.execute("PRAGMA journal_mode=WAL")
    # FK enforcement is disabled because referenced ids are re-sequenced
    # while rows are inserted.
    dst.execute("PRAGMA foreign_keys=OFF")

    try:
        for src_path in input_files[1:]:
            print(f"\n[merge] Processing '{src_path.name}' …")
            src = sqlite3.connect(src_path)
            try:
                for table in get_table_names(src):
                    ensure_table(dst, src, table)
                    config = TABLE_CONFIG.get(table)

                    if config is None:
                        # Unknown table: fall back to content-based dedup
                        # without special id handling – treat first column
                        # as the id.
                        cols = get_columns(src, table)
                        id_col = cols[0] if cols else None
                        if id_col:
                            read, ins = merge_dedup_table(src, dst, table, id_col)
                            print(f" [{table}] (fallback dedup on '{id_col}') "
                                  f"read={read} inserted={ins}")
                    else:
                        id_col, strategy = config
                        if strategy == "dedup":
                            read, ins = merge_dedup_table(src, dst, table, id_col)
                        else:
                            read, ins = merge_jobs_table(src, dst, table, id_col)
                        print(f" [{table}] strategy={strategy} "
                              f"read={read} inserted={ins}")

                # Commit once per source file so a later failure keeps
                # earlier files' work.
                dst.commit()
            finally:
                src.close()
    finally:
        dst.close()

    print(f"\n[done] Output written to '{output_file}'")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the merge tool."""
    description = (
        "Merge multiple SQLite databases into one output file.\n"
        "Files are merged in the order they are provided.\n"
        "Duplicate rows are detected by content and inserted only once."
    )
    epilog = (
        "Examples:\n"
        " python merge_db.py -o merged.db 02.db 03.db\n"
        " python merge_db.py -o merged.db 02.db 03.db 04.db 05.db\n"
    )
    parser = argparse.ArgumentParser(
        prog="merge_db",
        description=description,
        # Raw formatter preserves the explicit newlines above.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument(
        "inputs",
        metavar="INPUT_DB",
        nargs="+",
        help="Input SQLite database files (in desired merge order)",
    )
    parser.add_argument(
        "-o", "--output",
        metavar="OUTPUT_DB",
        required=True,
        help="Path for the merged output database (will be overwritten if it exists)",
    )
    return parser
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: parse arguments, validate paths, run the merge.

    Fix over the original: the output path is now rejected if it is also
    one of the inputs — the bootstrap copy would otherwise clobber that
    input before merging (silent data corruption).
    """
    parser = build_parser()
    args = parser.parse_args()

    input_paths = [Path(p) for p in args.inputs]
    output_path = Path(args.output)

    # Validate inputs
    for p in input_paths:
        if not p.exists():
            parser.error(f"Input file not found: {p}")
        if not p.is_file():
            parser.error(f"Not a file: {p}")

    if len(input_paths) < 2:
        parser.error("At least two input files are required.")

    # Guard: merging onto one of the inputs would overwrite it during the
    # bootstrap copy. Compare resolved paths so relative/absolute spellings
    # of the same file are caught.
    resolved_output = output_path.resolve()
    if any(p.resolve() == resolved_output for p in input_paths):
        parser.error("Output file must not be one of the input files.")

    if output_path.exists():
        print(f"[warn] Output file '{output_path}' already exists and will be overwritten.")

    merge_databases(input_paths, output_path)
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|