add merge tool

2026-04-21 21:41:17 +02:00
parent 91a6c4a304
commit d4f89f39bd
2 changed files with 294 additions and 0 deletions

tools/merge_db.py Normal file

@@ -0,0 +1,281 @@
#!/usr/bin/env python3
"""
merge_db.py: Merge multiple SQLite databases into one output file.
Rules
-----
* Tables `category` and `positionen` are deduplicated by content
  (compared on all columns except the local rowid/id column).
* Table `jobs` rows from later files are appended with a re-sequenced
`jobid` so IDs never clash across files.
* Rows are inserted in file order: the first file wins on deduplication
for reference tables; job rows from later files come after earlier ones.
Usage
-----
python merge_db.py -o merged.db 02.db 03.db [04.db ...]
"""
import argparse
import shutil
import sqlite3
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def get_table_names(con: sqlite3.Connection) -> list[str]:
    """Return the names of all user tables in `con`, sorted by name.

    Internal tables such as `sqlite_sequence` are excluded so the merge
    loop never tries to copy them.
    """
    cur = con.execute(
        "SELECT name FROM sqlite_master"
        " WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
    )
    return [row[0] for row in cur.fetchall()]
def get_columns(con: sqlite3.Connection, table: str) -> list[str]:
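    """Return the column names of `table`, in declaration order."""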
cur = con.execute(f'PRAGMA table_info("{table}")')
return [row[1] for row in cur.fetchall()]
def get_create_statement(con: sqlite3.Connection, table: str) -> str | None:
    """Return the CREATE TABLE DDL for `table`, or None if the table is absent."""
cur = con.execute(
"SELECT sql FROM sqlite_master WHERE type='table' AND name=?", (table,)
)
row = cur.fetchone()
return row[0] if row else None
# ---------------------------------------------------------------------------
# Merge logic per table type
# ---------------------------------------------------------------------------
def merge_dedup_table(
src: sqlite3.Connection,
dst: sqlite3.Connection,
table: str,
id_col: str,
) -> tuple[int, int]:
"""
Merge a reference table using content-based deduplication.
Rows are compared on every column *except* `id_col`. If an identical
row already exists in `dst`, it is skipped; otherwise it is inserted
with a new auto-incremented id.
Returns (rows_read, rows_inserted).
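    Example (doctest; a minimal, hypothetical `category(catid, name)` schema):

    >>> src = sqlite3.connect(":memory:")
    >>> dst = sqlite3.connect(":memory:")
    >>> for con in (src, dst):
    ...     _ = con.execute("CREATE TABLE category (catid INTEGER PRIMARY KEY, name TEXT)")
    >>> _ = dst.execute("INSERT INTO category (name) VALUES ('a')")
    >>> _ = src.execute("INSERT INTO category (name) VALUES ('a'), ('b')")
    >>> merge_dedup_table(src, dst, "category", "catid")
    (2, 1)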
"""
cols = get_columns(src, table)
content_cols = [c for c in cols if c != id_col]
if not content_cols:
return 0, 0
placeholders = ", ".join("?" * len(content_cols))
select_content = ", ".join(f'"{c}"' for c in content_cols)
# Build a set of existing content tuples for fast dedup lookup
existing: set[tuple] = set()
for row in dst.execute(f'SELECT {select_content} FROM "{table}"'):
existing.add(row)
src_rows = src.execute(f'SELECT {select_content} FROM "{table}"').fetchall()
inserted = 0
for row in src_rows:
key = tuple(row)
if key not in existing:
dst.execute(
f'INSERT INTO "{table}" ({select_content}) VALUES ({placeholders})',
row,
)
existing.add(key)
inserted += 1
return len(src_rows), inserted
def merge_jobs_table(
src: sqlite3.Connection,
dst: sqlite3.Connection,
table: str = "jobs",
id_col: str = "jobid",
) -> tuple[int, int]:
"""
Append job rows from `src` into `dst`, re-sequencing `id_col` so that
new IDs start after the current maximum in `dst`.
Duplicate detection is done on all columns except `id_col`; rows with
identical content are skipped.
Returns (rows_read, rows_inserted).
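    Example (doctest; a minimal, hypothetical `jobs(jobid, name)` schema):

    >>> src = sqlite3.connect(":memory:")
    >>> dst = sqlite3.connect(":memory:")
    >>> for con in (src, dst):
    ...     _ = con.execute("CREATE TABLE jobs (jobid INTEGER PRIMARY KEY, name TEXT)")
    >>> _ = dst.execute("INSERT INTO jobs VALUES (1, 'a')")
    >>> _ = src.execute("INSERT INTO jobs VALUES (1, 'b')")
    >>> merge_jobs_table(src, dst)
    (1, 1)
    >>> dst.execute("SELECT jobid, name FROM jobs ORDER BY jobid").fetchall()
    [(1, 'a'), (2, 'b')]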
"""
cols = get_columns(src, table)
content_cols = [c for c in cols if c != id_col]
all_col_str = ", ".join(f'"{c}"' for c in cols)
content_str = ", ".join(f'"{c}"' for c in content_cols)
placeholders = ", ".join("?" * len(cols))
# Current max id in destination
row = dst.execute(f'SELECT MAX("{id_col}") FROM "{table}"').fetchone()
offset = (row[0] or 0)
# Existing content fingerprints for dedup
existing: set[tuple] = set()
for row in dst.execute(f'SELECT {content_str} FROM "{table}"'):
existing.add(tuple(row))
src_rows = src.execute(f'SELECT {all_col_str} FROM "{table}"').fetchall()
col_index = {c: i for i, c in enumerate(cols)}
id_idx = col_index[id_col]
inserted = 0
for row in src_rows:
content_key = tuple(row[col_index[c]] for c in content_cols)
if content_key in existing:
continue # skip duplicate content
new_id = row[id_idx] + offset
new_row = list(row)
new_row[id_idx] = new_id
dst.execute(
f'INSERT INTO "{table}" ({all_col_str}) VALUES ({placeholders})',
new_row,
)
existing.add(content_key)
inserted += 1
return len(src_rows), inserted
# ---------------------------------------------------------------------------
# Schema helpers
# ---------------------------------------------------------------------------
def ensure_table(dst: sqlite3.Connection, src: sqlite3.Connection, table: str) -> None:
"""Create `table` in `dst` if it doesn't exist yet, using src's DDL."""
exists = dst.execute(
"SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (table,)
).fetchone()
if not exists:
ddl = get_create_statement(src, table)
if ddl:
dst.execute(ddl)
# ---------------------------------------------------------------------------
# Main merge routine
# ---------------------------------------------------------------------------
# Map table name → (id_column, merge_strategy)
# strategy: "dedup" → content-based dedup, new ids assigned
# "jobs" → append with id re-sequencing + content dedup
TABLE_CONFIG: dict[str, tuple[str, str]] = {
"category": ("catid", "dedup"),
"positionen": ("posid", "dedup"),
"jobs": ("jobid", "jobs"),
}
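# Tables not listed here fall back to content-based dedup on their first
# column. To register another reference table (e.g. a hypothetical
# `customers` table keyed by `custid`), add an entry such as:
#     "customers": ("custid", "dedup"),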
def merge_databases(input_files: list[Path], output_file: Path) -> None:
# Bootstrap: copy first file as the base for the output
print(f"[init] Bootstrapping output from '{input_files[0].name}'")
shutil.copy2(input_files[0], output_file)
dst = sqlite3.connect(output_file)
dst.execute("PRAGMA journal_mode=WAL")
dst.execute("PRAGMA foreign_keys=OFF")
try:
for src_path in input_files[1:]:
print(f"\n[merge] Processing '{src_path.name}'")
src = sqlite3.connect(src_path)
src_tables = get_table_names(src)
for table in src_tables:
ensure_table(dst, src, table)
config = TABLE_CONFIG.get(table)
if config is None:
                    # Unknown table: fall back to content-based dedup without
                    # special id handling; treat the first column as the id.
cols = get_columns(src, table)
id_col = cols[0] if cols else None
if id_col:
read, ins = merge_dedup_table(src, dst, table, id_col)
print(f" [{table}] (fallback dedup on '{id_col}') "
f"read={read} inserted={ins}")
else:
id_col, strategy = config
if strategy == "dedup":
read, ins = merge_dedup_table(src, dst, table, id_col)
else:
read, ins = merge_jobs_table(src, dst, table, id_col)
print(f" [{table}] strategy={strategy} "
f"read={read} inserted={ins}")
dst.commit()
src.close()
finally:
dst.close()
print(f"\n[done] Output written to '{output_file}'")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="merge_db",
description=(
"Merge multiple SQLite databases into one output file.\n"
"Files are merged in the order they are provided.\n"
"Duplicate rows are detected by content and inserted only once."
),
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=(
"Examples:\n"
" python merge_db.py -o merged.db 02.db 03.db\n"
" python merge_db.py -o merged.db 02.db 03.db 04.db 05.db\n"
),
)
parser.add_argument(
"inputs",
metavar="INPUT_DB",
nargs="+",
help="Input SQLite database files (in desired merge order)",
)
parser.add_argument(
"-o", "--output",
metavar="OUTPUT_DB",
required=True,
help="Path for the merged output database (will be overwritten if it exists)",
)
return parser
def main() -> None:
parser = build_parser()
args = parser.parse_args()
input_paths = [Path(p) for p in args.inputs]
output_path = Path(args.output)
# Validate inputs
for p in input_paths:
if not p.exists():
parser.error(f"Input file not found: {p}")
if not p.is_file():
parser.error(f"Not a file: {p}")
    if len(input_paths) < 2:
        parser.error("At least two input files are required.")
    if any(p.resolve() == output_path.resolve() for p in input_paths):
        parser.error("The output file must not also be an input file.")
    if output_path.exists():
        print(f"[warn] Output file '{output_path}' already exists and will be overwritten.")
merge_databases(input_paths, output_path)
if __name__ == "__main__":
main()