This commit is contained in:
2026-04-03 00:49:35 +02:00
parent 18526f8b45
commit 60fb674c8d
5 changed files with 751 additions and 0 deletions

View File

@@ -0,0 +1,126 @@
import argparse
from udpak_semistruktur.config import valider_yaml
from udpak_semistruktur.logger import opsaet_logging, hent_logger
from udpak_semistruktur import ddl
from udpak_semistruktur.extract.reader import læs_filer
from udpak_semistruktur.extract.extractor import generer_datafil
from udpak_semistruktur.transform.clean import rens, tag_strip, fjern_linjeskift, upper_lower, filename
from udpak_semistruktur.transform.reshape import flatten, join, where, id_felt, sammensat_noegle
from udpak_semistruktur.transform.convert import konverter
from udpak_semistruktur.transform.hash import beregn_hash
from udpak_semistruktur.load.file_writer import generer_filer_med_overskrifter, skriv_fil_med_retry
from udpak_semistruktur.load.db_writer import get_ase_connection_windows, insert_rows_ase
from udpak_semistruktur.db import læs_json_fil
from udpak_semistruktur.utils import generer_filnavn
logger = hent_logger(__name__)
def _byg_argument_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for this tool."""
    beskrivelse = "Udtræk og transformation af semistrukturerede data (JSON/XML)."
    parser = argparse.ArgumentParser(description=beskrivelse)
    parser.add_argument("--config", required=True, help="Sti til YAML-konfigurationsfil")
    # The DDL module owns its own flags (--DDL, --tmp, --flyt, --flyt_kort).
    ddl.add_cli_args(parser)
    return parser
def _kør_udtræk(config: dict, global_config: dict) -> None:
    """Run the normal extraction-and-transformation pipeline.

    For every input record, each configured output (file or table) is
    extracted, passed through the transform chain and then loaded.

    Args:
        config: The full validated YAML configuration.
        global_config: The global 'config' section (paths, encoding, DB info).
    """
    input_fil = global_config.get("input_fil")
    input_fil_liste = global_config.get("input_fil_liste")
    # Only open a DB connection when at least one output targets a table.
    har_tabel_output = any(
        cfg.get("type") == "tabel"
        for cfg in config.get("output_filer", [])
    )
    conn = None
    if har_tabel_output:
        bruger, password, env, host, port = læs_json_fil(global_config)
        conn = get_ase_connection_windows(
            bruger, password, host, port,
            global_config["database"]
        )
    try:
        for record in læs_filer(global_config, input_fil, input_fil_liste):
            for cfg in config.get("output_filer", []):
                tmp_data = _transformer_record(record, cfg, global_config)
                if cfg.get("type") == "fil":
                    _skriv_fil_output(tmp_data, cfg, global_config)
                elif cfg.get("type") == "tabel":
                    _skriv_tabel_output(tmp_data, cfg, conn)
    finally:
        # BUGFIX: previously the connection was only closed on the happy path;
        # an exception during extraction/loading leaked it.
        if conn is not None:
            conn.close()
            logger.debug("DB-forbindelse lukket.")


def _transformer_record(record, cfg: dict, global_config: dict) -> dict:
    """Extract one record and run it through the full transform chain (order matters)."""
    tmp_data = generer_datafil(record, cfg, global_config)
    tmp_data = join(tmp_data, cfg)
    tmp_data = flatten(tmp_data, cfg)
    tmp_data = rens(tmp_data, cfg, global_config)
    tmp_data = tag_strip(tmp_data, cfg, global_config)
    tmp_data = fjern_linjeskift(tmp_data, cfg, global_config)
    tmp_data = where(tmp_data, cfg, global_config)
    tmp_data = konverter(tmp_data, cfg, global_config)
    tmp_data = upper_lower(tmp_data, cfg, global_config)
    tmp_data = id_felt(tmp_data, cfg)
    tmp_data = sammensat_noegle(tmp_data, cfg, global_config)
    tmp_data = beregn_hash(tmp_data, cfg, global_config)
    tmp_data = filename(tmp_data, cfg, global_config)
    return tmp_data


def _skriv_fil_output(tmp_data: dict, cfg: dict, global_config: dict) -> None:
    """Append the transformed rows to the configured output file (with retry)."""
    fil_navn = generer_filnavn(cfg["fil_navn"], global_config)
    output_sti = global_config["output_path"] + fil_navn
    overskrifter = cfg.get("overskrifter", True)
    generer_filer_med_overskrifter(overskrifter, output_sti, cfg["kolonner"], global_config)
    separator = global_config["separator"]
    encoding = global_config["encoding"]

    # skriv() closes over local state but is invoked immediately by
    # skriv_fil_med_retry, so the closure never outlives this call.
    def skriv():
        with open(output_sti, "a", encoding=encoding) as f:
            for række in tmp_data["rækker"]:
                linje = separator.join(
                    str(række.get(k["navn"], "")) for k in cfg["kolonner"]
                )
                f.write(linje + "\n")

    skriv_fil_med_retry(skriv, output_sti)
    logger.info(f"Fil: {output_sti} skrevet ({len(tmp_data['rækker'])} rækker)")


def _skriv_tabel_output(tmp_data: dict, cfg: dict, conn) -> None:
    """Insert the transformed rows into the configured database table."""
    kolonner = [k["navn"] for k in cfg["kolonner"]]
    indsatte, fejlede = insert_rows_ase(conn, cfg["tabel_navn"], kolonner, tmp_data["rækker"])
    logger.info(f"DB: {indsatte} rækker indsat i {cfg['tabel_navn']}")
    if fejlede:
        logger.warning(f"DB: {len(fejlede)} rækker fejlede i {cfg['tabel_navn']}")
def main():
    """Entry point: parse arguments, load configuration, dispatch to a mode."""
    args = _byg_argument_parser().parse_args()
    # Validate and load the YAML configuration before doing anything else.
    config = valider_yaml(args.config)
    global_config = config["config"]
    # Configure logging from the global settings.
    opsaet_logging(
        log_fil=global_config["logfil"],
        niveau=global_config.get("log_niveau", "info"),
    )
    # Either run the DDL-generation flow or the normal extraction pipeline.
    if ddl.is_enabled(args):
        ddl.run_ddl_mode(args, config, global_config)
    else:
        _kør_udtræk(config, global_config)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,387 @@
"""
ddl_tool_ase.py
Sybase ASE helper for:
- DDL generation (CREATE TABLE) for base + tmp tables
- Flyt scripts (DELETE with JOIN placeholder + INSERT/SELECT)
- Collecting combined sql files
Design goals (as requested):
- YAML may contain _tmp table names (because the extractor loads into _tmp)
- DDL generation should produce BOTH:
- base table (without _tmp)
- tmp table (with _tmp)
when YAML ends with _tmp
- If YAML does NOT end with _tmp:
- always generate base DDL
- generate tmp DDL only when --tmp is given
- Flyt scripts always move tmp -> base
- Sybase ASE DELETE must NOT use table aliases
- No join_cols intelligence; only placeholders (ON 1=1 + commented AND lines)
"""
from __future__ import annotations
import os
from copy import deepcopy
from typing import Dict, List, Tuple, Any
from udpak_semistruktur.utils import generer_filnavn
from udpak_semistruktur.logger import hent_logger
logger = hent_logger(__name__)
# ------------------------------------------------------------
# CLI wiring
# ------------------------------------------------------------
def add_cli_args(parser) -> None:
    """
    Register the CLI flags used by DDL mode on the given argparse parser.
    All flags are boolean store_true switches.
    """
    flag_definitioner = (
        ("--DDL", "Generate DDL (CREATE TABLE) files"),
        ("--tmp", "Also generate _tmp table variants when YAML table is base"),
        ("--flyt", "Also generate delete+insert (move) scripts for Sybase ASE"),
        ("--flyt_kort", "Generere delete+insert statements, men i kort version."),
    )
    for flag, hjælpetekst in flag_definitioner:
        parser.add_argument(flag, action="store_true", help=hjælpetekst)
def is_enabled(args) -> bool:
    """Return True when --DDL was passed, i.e. DDL mode should run."""
    ddl_flag = getattr(args, "DDL", False)
    return bool(ddl_flag)
# ------------------------------------------------------------
# Name helpers
# ------------------------------------------------------------
def _map_yaml_type_to_ase(col: dict, dato_ud_global = "%Y-%m-%d") -> str:
"""
Map YAML kolonnefelt til Sybase ASE SQL-type.
YAML keys vi kigger efter:
- type: string|integer|float|decimal|boolean|date|hash|id|file
- max_længde / length: int (til varchar)
- precision: int (til decimal)
- decimaler: int (scale til decimal)
Fallback for ukendt/uden type: VARCHAR(255)
"""
t = str(col.get("type", "string")).lower()
length = col.get("max_længde", col.get("length", col.get("truncate")))
precision = col.get("precision", 18)
scale = col.get("decimaler", 2)
dato_fmt = col.get("dato_ud", dato_ud_global)
if t in ("string", "hash", "id", "file"):
n = int(length) if length else 50
return f"VARCHAR({n})"
if t in ("integer", "bigint"):
return "INT"
if t in ("float", "decimal"):
return f"DECIMAL({precision},{scale})"
if t == "boolean":
# BIT findes, men TINYINT er ofte mere kompatibelt i ASE
return "TINYINT"
if t == "date":
time_codes = ['%H', '%M', '%S', '%I', '%p', '%f']
if (any(code in dato_fmt for code in time_codes)) or (dato_fmt.upper() == 'SYBASE'):
return "DATETIME"
return "DATE"
# default fallback
return "VARCHAR(50)"
def split_base_tmp(table_name: str) -> Tuple[str, str]:
    """
    Split a YAML table name into (base_table, tmp_table).

    A name ending in _tmp (case-insensitive) is treated as the tmp table
    and the suffix is stripped to get the base; otherwise the name is the
    base and _tmp is appended to get the tmp table.
    """
    suffix = "_tmp"
    if table_name.lower().endswith(suffix):
        return table_name[:-len(suffix)], table_name
    return table_name, table_name + suffix
def _safe_name(name: str) -> str:
return name.replace(".", "_")
def _default_ddl_filename(table_name: str) -> str:
    """Default filename for a table's CREATE script."""
    return _safe_name(table_name) + "_create.sql"
def _default_delete_filename(base_table: str) -> str:
    """Default filename for a base table's DELETE script."""
    return _safe_name(base_table) + "_delete.sql"
def _default_flyt_filename(base_table: str) -> str:
    """Default filename for a base table's move (flyt) script."""
    return _safe_name(base_table) + "_flyt.sql"
def _skriv_flyt_scripts(
    tabel: str,
    base_tabel: str,
    tmp_tabel: str,
    file_conf: dict,
    outdir: str,
    insert_func: callable,
    samlet_flyt_indhold: list,
) -> None:
    """
    Generate and write the delete- and move-scripts for a single table.

    Writes one DELETE script and one INSERT/SELECT script (produced by
    insert_func) to outdir, and appends both to samlet_flyt_indhold so the
    caller can build the combined file.
    """
    kolonnenavne = [kol["navn"] for kol in file_conf.get("kolonner", [])]
    delete_sql = generate_delete_join_sql(tabel, kolonnenavne)
    flyt_sql = insert_func(tabel, kolonnenavne)
    scripts = [
        (os.path.join(outdir, _default_delete_filename(base_tabel)), delete_sql),
        (os.path.join(outdir, _default_flyt_filename(base_tabel)), flyt_sql),
    ]
    for sti, indhold in scripts:
        with open(sti, "w", encoding="utf-8") as f:
            f.write(indhold)
    samlet_flyt_indhold.append(f"-- DELETE: {base_tabel} (match mod {tmp_tabel})\n{delete_sql}\n")
    samlet_flyt_indhold.append(f"-- FLYT: {tmp_tabel} -> {base_tabel}\n{flyt_sql}\n")
    for sti, _ in scripts:
        logger.info(f"[FLYT] Skrev {sti}")
# ------------------------------------------------------------
# SQL generators (Sybase ASE)
# ------------------------------------------------------------
def generate_create_table_sql(file_config: dict, global_config: dict) -> str:
    """
    Generate CREATE TABLE DDL for a single output-file configuration.

    Requires file_config['tabel_navn'] to be set.
      - column name  = kol["navn"]
      - SQL type     = YAML 'type' mapped via _map_yaml_type_to_ase
      - nullability  = 'påkrævet' (True => NOT NULL)
      - optional 'primary_key': true on one or more columns -> PRIMARY KEY

    Returns a string with a DROP-IF-EXISTS block followed by CREATE TABLE.
    Note: global_config is currently not read by this function; it is kept
    for signature symmetry with the other generators.

    Raises:
        ValueError: when 'tabel_navn' is missing or 'kolonner' is empty.
    """
    table = file_config.get("tabel_navn")
    if not table:
        raise ValueError("Kan ikke generere DDL: 'tabel_navn' mangler i output_filer-blok.")
    cols = file_config.get("kolonner", [])
    if not cols:
        raise ValueError(f"Kan ikke generere DDL for {table}: 'kolonner' er tom.")
    # One line per column: "name" TYPE NULL/NOT NULL
    col_lines = [
        f'    "{col["navn"]}" {_map_yaml_type_to_ase(col)} '
        + ("NOT NULL" if col.get("påkrævet") else "NULL")
        for col in cols
    ]
    pk_cols = [col["navn"] for col in cols if col.get("primary_key")]
    # Optional PRIMARY KEY clause
    if pk_cols:
        quoted = ", ".join(f'"{c}"' for c in pk_cols)
        pk_line = f",\n    PRIMARY KEY ({quoted})"
    else:
        pk_line = ""
    cols_block = ",\n".join(col_lines) + pk_line
    # DROP IF EXISTS for ASE (via sysobjects); the table name may be
    # schema-qualified (e.g. dbo.MinTabel), sysobjects is matched on the
    # bare table name.
    schema_qualified = table
    table_only = table.split(".")[-1]
    drop_part = (
        f"IF EXISTS (SELECT 1 FROM sysobjects WHERE name = '{table_only}' AND type = 'U')\n"
        f"BEGIN\n"
        f"    DROP TABLE {schema_qualified}\n"
        f"END\nGO\n\n"
    )
    create_part = (
        f"CREATE TABLE {schema_qualified} (\n"
        f"{cols_block}\n"
        f");\nGO\n"
    )
    return drop_part + create_part
def generate_delete_join_sql(table_name_from_yaml: str, columns: List[str]) -> str:
    """
    Build a Sybase ASE-compatible DELETE-with-JOIN placeholder script.

    IMPORTANT: ASE does not allow table aliases in DELETE, so the base and
    tmp tables are referenced by their full names. The join condition is a
    placeholder (ON 1 = 1) followed by one commented AND line per column:

        DELETE FROM base
        FROM base
        JOIN tmp
            ON 1 = 1
            -- AND base.col = tmp.col
    """
    # Derive base/tmp names from the YAML table name (same rule as split_base_tmp).
    if table_name_from_yaml.lower().endswith("_tmp"):
        base_table = table_name_from_yaml[:-4]
        tmp_table = table_name_from_yaml
    else:
        base_table = table_name_from_yaml
        tmp_table = f"{table_name_from_yaml}_tmp"
    header = [
        f"DELETE FROM {base_table}",
        f"FROM {base_table}",
        f"JOIN {tmp_table}",
        "    ON 1 = 1",
    ]
    hints = [f"    -- AND {base_table}.{col} = {tmp_table}.{col}" for col in columns]
    return "\n".join(header + hints + [" "])
def generate_insert_move_sql(table_name_from_yaml: str, columns: List[str]) -> str:
    """
    Build INSERT INTO base (cols) SELECT cols FROM tmp with an explicit
    column list on both sides.
    """
    # Derive base/tmp names from the YAML table name (same rule as split_base_tmp).
    if table_name_from_yaml.lower().endswith("_tmp"):
        base_table = table_name_from_yaml[:-4]
        tmp_table = table_name_from_yaml
    else:
        base_table = table_name_from_yaml
        tmp_table = f"{table_name_from_yaml}_tmp"
    cols_block = ",\n    ".join(columns)
    script = [
        f"INSERT INTO {base_table} (",
        f"    {cols_block}",
        ")",
        "SELECT",
        f"    {cols_block}",
        f"FROM {tmp_table}",
        " ",
    ]
    return "\n".join(script)
def generate_insert_move_sql_short(table_name_from_yaml: str, columns: List[str]) -> str:
    """
    Build the short move script: INSERT INTO base SELECT * FROM tmp.

    The *columns* parameter is accepted for signature parity with
    generate_insert_move_sql but is intentionally unused, since SELECT *
    covers all columns. (An unused local cols_block was removed.)
    """
    base_table, tmp_table = split_base_tmp(table_name_from_yaml)
    lines: List[str] = [
        f"INSERT INTO {base_table} ",
        "SELECT * ",
        f"FROM {tmp_table}",
        " ",
    ]
    return "\n".join(lines)
# ------------------------------------------------------------
# DDL mode runner
# ------------------------------------------------------------
def run_ddl_mode(args, config: Dict[str, Any], global_config: Dict[str, Any]) -> None:
    """
    Executes the full DDL-only flow and writes files.

    For every output_filer entry that has a 'tabel_navn':
      1. always write a CREATE script for the base table,
      2. write one for the _tmp table when the YAML name ends in _tmp or
         --tmp was given,
      3. write delete+insert move scripts when --flyt and/or --flyt_kort
         was given,
    and finally write combined files collecting all generated SQL, all
    under <output_path>/sql.
    """
    outdir = os.path.join(global_config["output_path"], "sql")
    os.makedirs(outdir, exist_ok=True)
    antal = 0  # number of CREATE scripts written
    samlet_sql_indhold: List[str] = []   # all CREATE scripts, for the combined file
    samlet_flyt_indhold: List[str] = []  # all move scripts, for the combined file
    output_filer = config.get("output_filer", [])
    for file_conf in output_filer:
        tabel = file_conf.get("tabel_navn")
        if not tabel:
            # Pure file outputs have no table name; nothing to generate.
            continue
        try:
            base_tabel, tmp_tabel = split_base_tmp(tabel)
            yaml_is_tmp = tabel.lower().endswith("_tmp")
            # ---------------------------------------------------------
            # 1) Base DDL (always)
            # ---------------------------------------------------------
            base_conf = deepcopy(file_conf)
            base_conf["tabel_navn"] = base_tabel
            ddl_sql_base = generate_create_table_sql(base_conf, global_config)
            samlet_sql_indhold.append(f"-- Tabel: {base_tabel}\n{ddl_sql_base}\n")
            # If YAML already is base, respect ddl_fil_navn if present.
            # If YAML is tmp (base differs), don't reuse ddl_fil_navn blindly.
            if (not yaml_is_tmp) and file_conf.get("ddl_fil_navn"):
                ddl_base_name = file_conf["ddl_fil_navn"]
            else:
                ddl_base_name = _default_ddl_filename(base_tabel)
            # generer_filnavn expands placeholders (e.g. dates) in the filename.
            ddl_base_name = generer_filnavn(ddl_base_name, global_config)
            ddl_base_path = os.path.join(outdir, ddl_base_name)
            with open(ddl_base_path, "w", encoding="utf-8") as f:
                f.write(ddl_sql_base)
            logger.info(f"[DDL] Skrev {ddl_base_path}")
            antal += 1
            # ---------------------------------------------------------
            # 2) TMP DDL
            #    - If YAML is _tmp: ALWAYS generate tmp too
            #    - Else: only when --tmp is set
            # ---------------------------------------------------------
            skal_lave_tmp = yaml_is_tmp or bool(getattr(args, "tmp", False))
            if skal_lave_tmp:
                tmp_conf = deepcopy(file_conf)
                tmp_conf["tabel_navn"] = tmp_tabel
                ddl_sql_tmp = generate_create_table_sql(tmp_conf, global_config)
                samlet_sql_indhold.append(f"-- Tabel: {tmp_tabel}\n{ddl_sql_tmp}\n")
                ddl_tmp_name = _default_ddl_filename(tmp_tabel)
                ddl_tmp_name = generer_filnavn(ddl_tmp_name, global_config)
                ddl_tmp_path = os.path.join(outdir, ddl_tmp_name)
                with open(ddl_tmp_path, "w", encoding="utf-8") as f_tmp:
                    f_tmp.write(ddl_sql_tmp)
                logger.info(f"[DDL] Skrev {ddl_tmp_path}")
                antal += 1
            # ---------------------------------------------------------
            # 3) Flyt scripts (Sybase ASE) if --flyt
            #    Always based on YAML table name, which determines base/tmp
            # ---------------------------------------------------------
            if getattr(args, "flyt", False):
                _skriv_flyt_scripts(tabel, base_tabel, tmp_tabel, file_conf, outdir,
                                    generate_insert_move_sql, samlet_flyt_indhold)
            # ---------------------------------------------------------
            # 4) Flyt scripts (Sybase ASE) if --flyt_kort
            #    Always based on YAML table name, which determines base/tmp
            # ---------------------------------------------------------
            if getattr(args, "flyt_kort", False):
                _skriv_flyt_scripts(tabel, base_tabel, tmp_tabel, file_conf, outdir,
                                    generate_insert_move_sql_short, samlet_flyt_indhold)
        except Exception as e:
            # Fail fast: log which table broke, then re-raise unchanged.
            logger.error(f"[DDL] Fejl for {tabel}: {e}")
            raise
    # ---------------------------------------------------------
    # Combined files
    # ---------------------------------------------------------
    if antal > 0:
        samlet_sti = os.path.join(outdir, "sql_samlet.sql")
        with open(samlet_sti, "w", encoding="utf-8") as f_alt:
            f_alt.write("\n".join(samlet_sql_indhold))
        logger.info(f"[DDL] Skrev samlet fil: {samlet_sti}")
        # NOTE(review): the combined move file is only written when at least
        # one CREATE script was also produced — presumably intentional since
        # move scripts require the tables to exist; confirm.
        if (getattr(args, "flyt", False) or getattr(args, "flyt_kort", False)) and len(samlet_flyt_indhold) > 0:
            samlet_flyt_sti = os.path.join(outdir, "sql_flyt_samlet.sql")
            with open(samlet_flyt_sti, "w", encoding="utf-8") as f_flyt_alt:
                f_flyt_alt.write("\n".join(samlet_flyt_indhold))
            logger.info(f"[FLYT] Skrev samlet fil: {samlet_flyt_sti}")
        logger.info(f"[DDL] FÆRDIG: {antal} fil(er) genereret.")
    else:
        logger.info("[DDL] Ingen DDL genereret.")

View File

@@ -0,0 +1,171 @@
import re
from typing import Any, Optional
from udpak_semistruktur.logger import hent_logger
from udpak_semistruktur.utils import er_tom, evaluer_værdi_token, EMPTY_SENTINELS
from udpak_semistruktur.extract.traversal import (
rekursiv_udpakning,
hent_objekt_fra_sti,
_resolve_with_indices,
matcher_hvis_findes,
)
logger = hent_logger(__name__)
def udvid_rod_alternativer(rod: str) -> list[dict]:
    """
    Expand a root string using the [a,b,c] alternative syntax into a list
    of {"rod": <expanded path>, "variant": <choice>} dicts.

    Inputs that are not strings, contain no bracket, or do not match the
    pattern (e.g. empty brackets) yield a single entry with variant None.
    """
    uden_alternativer = [{"rod": rod, "variant": None}]
    if not isinstance(rod, str) or "[" not in rod:
        return uden_alternativer
    match = re.fullmatch(r"(.*)\[([^\[\]]+)\](.*)", rod)
    if match is None:
        return uden_alternativer
    prefix, valg_tekst, suffix = match.group(1), match.group(2), match.group(3)
    resultat = []
    for valg in valg_tekst.split(","):
        valg = valg.strip()
        if valg:
            resultat.append({"rod": prefix + valg + suffix, "variant": valg})
    return resultat
def hent_fra_spec(spec, default_el, json_data, sti_index, global_config) -> tuple[Any, bool]:
    """
    Resolve a value from json_data according to a spec definition.

    Returns (value, missing) where missing=True signals that the source
    could not be located at all (as opposed to being present but empty).
    """
    if spec is None:
        return None, False
    # A fixed 'værdi' token takes priority over any field lookup.
    fast_værdi = spec.get("værdi") if "værdi" in spec else None
    if fast_værdi is not None:
        return evaluer_værdi_token(fast_værdi, global_config), False
    felt = spec.get("felt")
    if felt == "@key":
        # @key resolves to the dict key recorded by the traversal step.
        return (sti_index or {}).get("__key"), False
    if felt is None:
        return None, True
    if felt == ".":
        # "." means the current element itself is the value.
        return default_el, False
    # Choose the base object: an explicit rod path, or the current element.
    rod = spec.get("rod")
    if rod:
        parent_obj = hent_objekt_fra_sti(json_data, rod, sti_index)
        if not isinstance(parent_obj, (dict, list)):
            return None, True
        kandidat = _resolve_with_indices(parent_obj, felt, sti_index, base_path=rod)
    else:
        if not isinstance(default_el, (dict, list)):
            return None, True
        # base_path is unknown here; an empty base_path still resolves
        # correctly because indices accumulate from the field path itself.
        kandidat = _resolve_with_indices(default_el, felt, sti_index, base_path="")
    # A list element may legitimately resolve to None without counting as missing.
    mangler = kandidat is None and not isinstance(default_el, list)
    return kandidat, mangler
def hent_kolonne_værdi_med_fallback(kol: dict, el: Any, json_data: Any, sti_index: dict, global_config: dict) -> Any:
    """
    Resolve a column value with support for missing_fallback and tom_fallback.

    The primary source is either a fixed 'værdi' token (when no 'felt' is
    configured) or a field lookup. missing_fallback is consulted when the
    source is absent; tom_fallback when the resolved value is empty.
    """
    kolnavn = kol.get("navn")
    # 1) Primary source
    if kol.get("felt") is None:
        raw_value = kol.get("værdi", None)
        if raw_value is None:
            værdi, missing = None, True
        else:
            værdi, missing = evaluer_værdi_token(raw_value, global_config), False
    else:
        primær_spec = {"felt": kol["felt"]}
        rod = kol.get("rod")
        if rod:
            primær_spec["rod"] = rod
        værdi, missing = hent_fra_spec(primær_spec, el, json_data, sti_index, global_config)
    # 2) Source missing entirely -> try missing_fallback
    if missing:
        mf = kol.get("missing_fallback")
        if isinstance(mf, dict):
            v2, _ = hent_fra_spec(mf, el, json_data, sti_index, global_config)
            if not er_tom(v2):
                logger.debug(f"[UDTRÆK][FALLBACK][missing] kolonne={kolnavn} brugte missing_fallback={mf}")
                return v2
    # 3) Value present but empty -> try tom_fallback
    if er_tom(værdi):
        tf = kol.get("tom_fallback")
        if isinstance(tf, dict):
            v3, _ = hent_fra_spec(tf, el, json_data, sti_index, global_config)
            if not er_tom(v3):
                logger.debug(f"[UDTRÆK][FALLBACK][tom] kolonne={kolnavn} brugte tom_fallback={tf}")
                return v3
    return værdi
def generer_datafil(json_data: Any, yaml_config: dict, global_config: dict) -> dict:
    """
    Extract rows from json_data based on yaml_config.

    Returns a dict with 'header' (list of column names) and 'rækker'
    (list of row dicts keyed by column name).
    """
    output_filer = {}
    rod_sti = yaml_config["rod"]
    kolonner = yaml_config["kolonner"]
    hvis_findes = yaml_config.get("hvis_findes")
    # Robust special case for dict-of-dicts when rod is "*": iterate the
    # top-level items directly instead of traversing.
    if rod_sti in ("*", ".*") and isinstance(json_data, dict):
        # Store the key in sti_index["__key"] so @key lookups can use it.
        objekter = [(v, {"__key": k}) for k, v in json_data.items()]
    else:
        objekter = []
        # Expand [a,b,c] alternatives in the root path and traverse each variant.
        for rv in udvid_rod_alternativer(rod_sti):
            for element, sti_index in rekursiv_udpakning(json_data, rv["rod"]):
                ny_sti_index = dict(sti_index or {})
                if rv.get("variant") is not None:
                    # Record which alternative matched; exposed via column
                    # types 'rod_variant' / 'rod_path' below.
                    ny_sti_index["__rod_variant"] = rv["variant"]
                    ny_sti_index["__rod_path"] = rv["rod"]
                objekter.append((element, ny_sti_index))
    rækker = []
    header = [k["navn"] for k in kolonner]
    for element, sti_index in objekter:
        # Skip values representing "empty", as defined by EMPTY_SENTINELS.
        if element in EMPTY_SENTINELS:
            continue
        # A list element expands to one row per entry.
        elementer = element if isinstance(element, list) else [element]
        for el in elementer:
            if el in EMPTY_SENTINELS:
                continue
            # Optional filter: keep only records where a configured path exists.
            if hvis_findes and not matcher_hvis_findes(el, hvis_findes, sti_index):
                logger.debug(f"[UDTRÆK][hvis_findes] springer over record; ingen af stierne findes: {hvis_findes}")
                continue
            base_række = {}
            for kol in kolonner:
                navn = kol.get("navn")
                ktype = kol.get("type")
                if ktype == "rod_variant":
                    base_række[navn] = (sti_index or {}).get("__rod_variant")
                elif ktype == "rod_path":
                    base_række[navn] = (sti_index or {}).get("__rod_path")
                elif kol.get("felt") == ".":
                    # "." means: the element itself is the value.
                    base_række[navn] = el
                else:
                    base_række[navn] = hent_kolonne_værdi_med_fallback(
                        kol, el, json_data, sti_index, global_config
                    )
            rækker.append(base_række)
    output_filer['header'] = header
    output_filer['rækker'] = rækker
    return output_filer

View File

@@ -143,3 +143,16 @@ def rens(data: dict, file_config: dict, global_config: dict) -> dict:
række[kolonnenavn] = tmp række[kolonnenavn] = tmp
return data return data
def filename(data: dict, file_config: dict, global_config: dict) -> dict:
    """Populate every column of type 'file' with the current filename from
    global_config["current_file"]."""
    fil_kolonner = [
        kolonne["navn"]
        for kolonne in file_config.get("kolonner", [])
        if kolonne.get("type", None) == "file"
    ]
    for navn in fil_kolonner:
        for række in data["rækker"]:
            række[navn] = global_config["current_file"]
    return data

View File

@@ -0,0 +1,54 @@
import hashlib
import json
from udpak_semistruktur.logger import hent_logger
logger = hent_logger(__name__)
def beregn_hash(data: dict, file_config: dict, global_config: dict) -> dict:
    """
    Compute a hash value for every column of type 'hash', based on the
    values of the other columns in the row.

    Per hash column the YAML may specify:
      - hash_algoritme: a hashlib guaranteed algorithm (default 'sha256',
        case-insensitive)
      - separator: string joined between the input values (default '|')
      - hash_exclude: column names to leave out of the hash input

    Columns of type 'hash' or 'id' are always excluded from the input.
    Returns the (mutated) data dict.

    Raises:
        ValueError: if hash_algoritme is not a guaranteed hashlib algorithm.
    """
    kolonner = file_config.get("kolonner", [])
    for kolonne in kolonner:
        if kolonne.get("type", None) != "hash":
            continue
        # Normalize the algorithm name so validation matches the getattr
        # lookup below (previously 'SHA256' was rejected although the
        # lookup lowercased the name).
        algoritme = str(kolonne.get("hash_algoritme", "sha256")).lower()
        separator = kolonne.get("separator", "|")
        exclude = kolonne.get("hash_exclude", [])
        valid_algorithms = hashlib.algorithms_guaranteed
        if algoritme not in valid_algorithms:
            raise ValueError(f"Forkert HASH-algoritme: '{algoritme}'. Gyldige værdier: {', '.join(valid_algorithms)}")
        # BUGFIX: build the input-column list per hash column. Previously the
        # list was created once outside the loop, so with two or more hash
        # columns the names accumulated and later hashes covered duplicates.
        hash_kolonner = []
        for kol in kolonner:
            json_felt = kol.get("navn", None)
            if json_felt in exclude:
                continue
            if kol.get("type", None) not in ["hash", "id"]:
                hash_kolonner.append(kol["navn"])
        hash_func = getattr(hashlib, algoritme)  # hoisted out of the row loop
        for række in data["rækker"]:
            værdier = []
            for navn in hash_kolonner:
                værdi = række.get(navn)
                if isinstance(værdi, (dict, list)):
                    # Canonical JSON so dict key order never changes the hash.
                    værdi_str = json.dumps(værdi, sort_keys=True)
                else:
                    værdi_str = str(værdi)
                værdier.append(værdi_str)
            samlet_streng = separator.join(værdier).encode("utf-8")
            række[kolonne["navn"]] = hash_func(samlet_streng).hexdigest()
    return data