# --- file: udpak_semistruktur.py ---
"""CLI entry point for extraction/transformation of semi-structured data (JSON/XML).

Parses CLI arguments, validates the YAML configuration, sets up logging and
dispatches to either the DDL-generation flow or the normal extraction pipeline.
"""

import argparse

from udpak_semistruktur.config import valider_yaml
from udpak_semistruktur.logger import opsaet_logging, hent_logger
from udpak_semistruktur import ddl

from udpak_semistruktur.extract.reader import læs_filer
from udpak_semistruktur.extract.extractor import generer_datafil
from udpak_semistruktur.transform.clean import rens, tag_strip, fjern_linjeskift, upper_lower, filename
from udpak_semistruktur.transform.reshape import flatten, join, where, id_felt, sammensat_noegle
from udpak_semistruktur.transform.convert import konverter
from udpak_semistruktur.transform.hash import beregn_hash
from udpak_semistruktur.load.file_writer import generer_filer_med_overskrifter, skriv_fil_med_retry
from udpak_semistruktur.load.db_writer import get_ase_connection_windows, insert_rows_ase
from udpak_semistruktur.db import læs_json_fil
from udpak_semistruktur.utils import generer_filnavn

logger = hent_logger(__name__)


def _byg_argument_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser.

    Returns:
        argparse.ArgumentParser: parser with ``--config`` plus the DDL flags
        owned by the :mod:`udpak_semistruktur.ddl` module.
    """
    parser = argparse.ArgumentParser(
        description="Udtræk og transformation af semistrukturerede data (JSON/XML)."
    )
    parser.add_argument("--config", required=True, help="Sti til YAML-konfigurationsfil")

    # The DDL module owns its own flags (--DDL, --tmp, --flyt, --flyt_kort).
    ddl.add_cli_args(parser)
    return parser


def _kør_udtræk(config: dict, global_config: dict) -> None:
    """Run the normal extraction and transformation pipeline.

    For each input record and each configured output, the record is extracted,
    pushed through the transform pipeline and finally written to a file or a
    database table depending on ``cfg["type"]``.

    Args:
        config: full validated YAML configuration (contains ``output_filer``).
        global_config: the ``config`` sub-section with global settings
            (paths, separator, encoding, DB info, ...).
    """
    input_fil = global_config.get("input_fil")
    input_fil_liste = global_config.get("input_fil_liste")

    # Open a DB connection only when at least one output targets a table.
    har_tabel_output = any(
        cfg.get("type") == "tabel"
        for cfg in config.get("output_filer", [])
    )
    conn = None
    if har_tabel_output:
        bruger, password, env, host, port = læs_json_fil(global_config)
        conn = get_ase_connection_windows(
            bruger, password, host, port,
            global_config["database"]
        )

    # FIX: the connection was previously only closed on the success path;
    # wrap the whole pipeline so it is released even when a step raises.
    try:
        for record in læs_filer(global_config, input_fil, input_fil_liste):
            for cfg in config.get("output_filer", []):
                # 1) Extract
                tmp_data = generer_datafil(record, cfg, global_config)

                # 2) Transform pipeline (order is significant: reshape before
                #    cleaning, hashing after all value-changing steps).
                tmp_data = join(tmp_data, cfg)
                tmp_data = flatten(tmp_data, cfg)
                tmp_data = rens(tmp_data, cfg, global_config)
                tmp_data = tag_strip(tmp_data, cfg, global_config)
                tmp_data = fjern_linjeskift(tmp_data, cfg, global_config)
                tmp_data = where(tmp_data, cfg, global_config)
                tmp_data = konverter(tmp_data, cfg, global_config)
                tmp_data = upper_lower(tmp_data, cfg, global_config)
                tmp_data = id_felt(tmp_data, cfg)
                tmp_data = sammensat_noegle(tmp_data, cfg, global_config)
                tmp_data = beregn_hash(tmp_data, cfg, global_config)
                tmp_data = filename(tmp_data, cfg, global_config)

                # 3) Load - depends on cfg["type"]
                if cfg.get("type") == "fil":
                    fil_navn = generer_filnavn(cfg["fil_navn"], global_config)
                    output_sti = global_config["output_path"] + fil_navn

                    overskrifter = cfg.get("overskrifter", True)
                    generer_filer_med_overskrifter(overskrifter, output_sti, cfg["kolonner"], global_config)

                    separator = global_config["separator"]
                    encoding = global_config["encoding"]

                    # skriv() closes over the current loop variables, but it is
                    # invoked immediately by skriv_fil_med_retry, so the usual
                    # late-binding pitfall does not apply here.
                    def skriv():
                        with open(output_sti, "a", encoding=encoding) as f:
                            for række in tmp_data["rækker"]:
                                linje = separator.join(
                                    str(række.get(k["navn"], "")) for k in cfg["kolonner"]
                                )
                                f.write(linje + "\n")

                    skriv_fil_med_retry(skriv, output_sti)
                    logger.info(f"Fil: {output_sti} skrevet ({len(tmp_data['rækker'])} rækker)")

                elif cfg.get("type") == "tabel":
                    kolonner = [k["navn"] for k in cfg["kolonner"]]
                    indsatte, fejlede = insert_rows_ase(conn, cfg["tabel_navn"], kolonner, tmp_data["rækker"])
                    logger.info(f"DB: {indsatte} rækker indsat i {cfg['tabel_navn']}")
                    if fejlede:
                        logger.warning(f"DB: {len(fejlede)} rækker fejlede i {cfg['tabel_navn']}")
    finally:
        if conn is not None:
            conn.close()
            logger.debug("DB-forbindelse lukket.")


def main():
    """Script entry point: parse args, validate config, set up logging, dispatch."""
    parser = _byg_argument_parser()
    args = parser.parse_args()

    # Validate and load the YAML configuration.
    config = valider_yaml(args.config)
    global_config = config["config"]

    # Configure logging from the global config section.
    opsaet_logging(
        log_fil=global_config["logfil"],
        niveau=global_config.get("log_niveau", "info"),
    )

    # Either run the DDL-only flow or the normal extraction pipeline.
    if ddl.is_enabled(args):
        ddl.run_ddl_mode(args, config, global_config)
    else:
        _kør_udtræk(config, global_config)


if __name__ == "__main__":
    main()
tmp table (with _tmp) + when YAML ends with _tmp +- If YAML does NOT end with _tmp: + - always generate base DDL + - generate tmp DDL only when --tmp is given +- Flyt scripts always move tmp -> base +- Sybase ASE DELETE must NOT use table aliases +- No join_cols intelligence; only placeholders (ON 1=1 + commented AND lines) +""" + +from __future__ import annotations +import os + +from copy import deepcopy +from typing import Dict, List, Tuple, Any + +from udpak_semistruktur.utils import generer_filnavn +from udpak_semistruktur.logger import hent_logger + +logger = hent_logger(__name__) + +# ------------------------------------------------------------ +# CLI wiring +# ------------------------------------------------------------ + +def add_cli_args(parser) -> None: + """ + Add CLI flags used by DDL mode to your argparse parser. + """ + parser.add_argument("--DDL", action="store_true", help="Generate DDL (CREATE TABLE) files") + parser.add_argument("--tmp", action="store_true", help="Also generate _tmp table variants when YAML table is base") + parser.add_argument("--flyt", action="store_true", help="Also generate delete+insert (move) scripts for Sybase ASE") + parser.add_argument("--flyt_kort", action="store_true", help="Generere delete+insert statements, men i kort version.") + + +def is_enabled(args) -> bool: + return bool(getattr(args, "DDL", False)) + + +# ------------------------------------------------------------ +# Name helpers +# ------------------------------------------------------------ + +def _map_yaml_type_to_ase(col: dict, dato_ud_global = "%Y-%m-%d") -> str: + """ + Map YAML kolonnefelt til Sybase ASE SQL-type. 
+ YAML keys vi kigger efter: + - type: string|integer|float|decimal|boolean|date|hash|id|file + - max_længde / length: int (til varchar) + - precision: int (til decimal) + - decimaler: int (scale til decimal) + Fallback for ukendt/uden type: VARCHAR(255) + """ + t = str(col.get("type", "string")).lower() + length = col.get("max_længde", col.get("length", col.get("truncate"))) + precision = col.get("precision", 18) + scale = col.get("decimaler", 2) + dato_fmt = col.get("dato_ud", dato_ud_global) + + if t in ("string", "hash", "id", "file"): + n = int(length) if length else 50 + return f"VARCHAR({n})" + if t in ("integer", "bigint"): + return "INT" + if t in ("float", "decimal"): + return f"DECIMAL({precision},{scale})" + if t == "boolean": + # BIT findes, men TINYINT er ofte mere kompatibelt i ASE + return "TINYINT" + if t == "date": + time_codes = ['%H', '%M', '%S', '%I', '%p', '%f'] + if (any(code in dato_fmt for code in time_codes)) or (dato_fmt.upper() == 'SYBASE'): + return "DATETIME" + return "DATE" + + # default fallback + return "VARCHAR(50)" + + +def split_base_tmp(table_name: str) -> Tuple[str, str]: + """ + Returns (base_table, tmp_table). 
+ + If table_name ends with _tmp: + base_table = table_name without _tmp + tmp_table = table_name (as-is) + Else: + base_table = table_name + tmp_table = table_name + _tmp + """ + if table_name.lower().endswith("_tmp"): + return table_name[:-4], table_name + return table_name, f"{table_name}_tmp" + + +def _safe_name(name: str) -> str: + return name.replace(".", "_") + + +def _default_ddl_filename(table_name: str) -> str: + return f"{_safe_name(table_name)}_create.sql" + + +def _default_delete_filename(base_table: str) -> str: + return f"{_safe_name(base_table)}_delete.sql" + + +def _default_flyt_filename(base_table: str) -> str: + return f"{_safe_name(base_table)}_flyt.sql" + +def _skriv_flyt_scripts( + tabel: str, + base_tabel: str, + tmp_tabel: str, + file_conf: dict, + outdir: str, + insert_func: callable, + samlet_flyt_indhold: list, +) -> None: + """Genererer og skriver delete- og flyt-scripts for én tabel.""" + kolonner = [kol["navn"] for kol in file_conf.get("kolonner", [])] + delete_sql = generate_delete_join_sql(tabel, kolonner) + flyt_sql = insert_func(tabel, kolonner) + + delete_path = os.path.join(outdir, _default_delete_filename(base_tabel)) + flyt_path = os.path.join(outdir, _default_flyt_filename(base_tabel)) + + with open(delete_path, "w", encoding="utf-8") as f: + f.write(delete_sql) + with open(flyt_path, "w", encoding="utf-8") as f: + f.write(flyt_sql) + + samlet_flyt_indhold.append(f"-- DELETE: {base_tabel} (match mod {tmp_tabel})\n{delete_sql}\n") + samlet_flyt_indhold.append(f"-- FLYT: {tmp_tabel} -> {base_tabel}\n{flyt_sql}\n") + + logger.info(f"[FLYT] Skrev {delete_path}") + logger.info(f"[FLYT] Skrev {flyt_path}") + + +# ------------------------------------------------------------ +# SQL generators (Sybase ASE) +# ------------------------------------------------------------ + +def generate_create_table_sql(file_config: dict, global_config: dict) -> str: + """ + Genererer CREATE TABLE DDL for én output-fil-konfiguration. 
+ Kræver at file_config['tabel_navn'] er sat. + - Kolonnenavn = kol["navn"] + - Type = YAML 'type' -> ASE type (fallback VARCHAR(255)) + - NULL/NOT NULL = 'påkrævet' (True => NOT NULL) + - Optional 'primary_key': true (på en eller flere kolonner) -> PRIMARY KEY + Returnerer en streng med DROP-IF-EXISTS + CREATE TABLE. + """ + table = file_config.get("tabel_navn") + if not table: + raise ValueError("Kan ikke generere DDL: 'tabel_navn' mangler i output_filer-blok.") + + cols = file_config.get("kolonner", []) + if not cols: + raise ValueError(f"Kan ikke generere DDL for {table}: 'kolonner' er tom.") + + # Byg kolonne-linjer + col_lines = [] + pk_cols = [] + for col in cols: + name = col["navn"] + sql_type = _map_yaml_type_to_ase(col) + not_null = "NOT NULL" if col.get("påkrævet") else "NULL" + col_lines.append(f' "{name}" {sql_type} {not_null}') + if col.get("primary_key"): + pk_cols.append(name) + + # PRIMARY KEY (hvis angivet) + pk_line = "" + if pk_cols: + cols_list = ", ".join(f'"{c}"' for c in pk_cols) + pk_line = f",\n PRIMARY KEY ({cols_list})" + + cols_block = ",\n".join(col_lines) + pk_line + + # DROP IF EXISTS til ASE (sysobjects) + # (Tilpas evt. schema-adskillelse; her antager vi at table kan være dbo.MinTabel) + schema_qualified = table + table_only = table.split(".")[-1] + + drop_part = ( + f"IF EXISTS (SELECT 1 FROM sysobjects WHERE name = '{table_only}' AND type = 'U')\n" + f"BEGIN\n" + f" DROP TABLE {schema_qualified}\n" + f"END\nGO\n\n" + ) + + create_part = ( + f"CREATE TABLE {schema_qualified} (\n" + f"{cols_block}\n" + f");\nGO\n" + ) + + return drop_part + create_part + + +def generate_delete_join_sql(table_name_from_yaml: str, columns: List[str]) -> str: + """ + Sybase ASE-compatible DELETE with JOIN placeholder. + + IMPORTANT: No aliases in DELETE in Sybase ASE. 
+ + Output: + DELETE FROM base + FROM base + JOIN tmp + ON 1 = 1 + -- AND base.col = tmp.col + + """ + base_table, tmp_table = split_base_tmp(table_name_from_yaml) + + lines: List[str] = [] + lines.append(f"DELETE FROM {base_table}") + lines.append(f"FROM {base_table}") + lines.append(f"JOIN {tmp_table}") + lines.append(" ON 1 = 1") + for col in columns: + lines.append(f" -- AND {base_table}.{col} = {tmp_table}.{col}") + lines.append(" ") + return "\n".join(lines) + + +def generate_insert_move_sql(table_name_from_yaml: str, columns: List[str]) -> str: + """ + INSERT INTO base SELECT FROM tmp (all columns, explicit list). + """ + base_table, tmp_table = split_base_tmp(table_name_from_yaml) + + cols_block = ",\n ".join(columns) + + lines: List[str] = [] + lines.append(f"INSERT INTO {base_table} (") + lines.append(f" {cols_block}") + lines.append(")") + lines.append("SELECT") + lines.append(f" {cols_block}") + lines.append(f"FROM {tmp_table}") + lines.append(" ") + return "\n".join(lines) + +def generate_insert_move_sql_short(table_name_from_yaml: str, columns: List[str]) -> str: + """ + INSERT INTO base SELECT FROM tmp (*). + """ + base_table, tmp_table = split_base_tmp(table_name_from_yaml) + + cols_block = ",\n ".join(columns) + + lines: List[str] = [] + lines.append(f"INSERT INTO {base_table} ") + lines.append("SELECT * ") + lines.append(f"FROM {tmp_table}") + lines.append(" ") + return "\n".join(lines) + +# ------------------------------------------------------------ +# DDL mode runner +# ------------------------------------------------------------ + +def run_ddl_mode(args, config: Dict[str, Any], global_config: Dict[str, Any]) -> None: + """ + Executes the full DDL-only flow and writes files. 
+ """ + + outdir = os.path.join(global_config["output_path"], "sql") + os.makedirs(outdir, exist_ok=True) + + antal = 0 + samlet_sql_indhold: List[str] = [] + samlet_flyt_indhold: List[str] = [] + + output_filer = config.get("output_filer", []) + for file_conf in output_filer: + tabel = file_conf.get("tabel_navn") + if not tabel: + continue + + try: + base_tabel, tmp_tabel = split_base_tmp(tabel) + yaml_is_tmp = tabel.lower().endswith("_tmp") + + # --------------------------------------------------------- + # 1) Base DDL (always) + # --------------------------------------------------------- + base_conf = deepcopy(file_conf) + base_conf["tabel_navn"] = base_tabel + + ddl_sql_base = generate_create_table_sql(base_conf, global_config) + samlet_sql_indhold.append(f"-- Tabel: {base_tabel}\n{ddl_sql_base}\n") + + # If YAML already is base, respect ddl_fil_navn if present. + # If YAML is tmp (base differs), don't reuse ddl_fil_navn blindly. + if (not yaml_is_tmp) and file_conf.get("ddl_fil_navn"): + ddl_base_name = file_conf["ddl_fil_navn"] + else: + ddl_base_name = _default_ddl_filename(base_tabel) + + ddl_base_name = generer_filnavn(ddl_base_name, global_config) + ddl_base_path = os.path.join(outdir, ddl_base_name) + + with open(ddl_base_path, "w", encoding="utf-8") as f: + f.write(ddl_sql_base) + + logger.info(f"[DDL] Skrev {ddl_base_path}") + antal += 1 + + # --------------------------------------------------------- + # 2) TMP DDL + # - If YAML is _tmp: ALWAYS generate tmp too + # - Else: only when --tmp is set + # --------------------------------------------------------- + skal_lave_tmp = yaml_is_tmp or bool(getattr(args, "tmp", False)) + if skal_lave_tmp: + tmp_conf = deepcopy(file_conf) + tmp_conf["tabel_navn"] = tmp_tabel + + ddl_sql_tmp = generate_create_table_sql(tmp_conf, global_config) + samlet_sql_indhold.append(f"-- Tabel: {tmp_tabel}\n{ddl_sql_tmp}\n") + + ddl_tmp_name = _default_ddl_filename(tmp_tabel) + ddl_tmp_name = generer_filnavn(ddl_tmp_name, 
global_config) + ddl_tmp_path = os.path.join(outdir, ddl_tmp_name) + + with open(ddl_tmp_path, "w", encoding="utf-8") as f_tmp: + f_tmp.write(ddl_sql_tmp) + + logger.info(f"[DDL] Skrev {ddl_tmp_path}") + antal += 1 + + # --------------------------------------------------------- + # 3) Flyt scripts (Sybase ASE) if --flyt + # Always based on YAML table name, which determines base/tmp + # --------------------------------------------------------- + if getattr(args, "flyt", False): + _skriv_flyt_scripts(tabel, base_tabel, tmp_tabel, file_conf, outdir, + generate_insert_move_sql, samlet_flyt_indhold) + + # --------------------------------------------------------- + # 4) Flyt scripts (Sybase ASE) if --flyt_kort + # Always based on YAML table name, which determines base/tmp + # --------------------------------------------------------- + if getattr(args, "flyt_kort", False): + _skriv_flyt_scripts(tabel, base_tabel, tmp_tabel, file_conf, outdir, + generate_insert_move_sql_short, samlet_flyt_indhold) + + + except Exception as e: + logger.error(f"[DDL] Fejl for {tabel}: {e}") + raise + + # --------------------------------------------------------- + # Combined files + # --------------------------------------------------------- + if antal > 0: + samlet_sti = os.path.join(outdir, "sql_samlet.sql") + with open(samlet_sti, "w", encoding="utf-8") as f_alt: + f_alt.write("\n".join(samlet_sql_indhold)) + logger.info(f"[DDL] Skrev samlet fil: {samlet_sti}") + + if (getattr(args, "flyt", False) or getattr(args, "flyt_kort", False)) and len(samlet_flyt_indhold) > 0: + samlet_flyt_sti = os.path.join(outdir, "sql_flyt_samlet.sql") + with open(samlet_flyt_sti, "w", encoding="utf-8") as f_flyt_alt: + f_flyt_alt.write("\n".join(samlet_flyt_indhold)) + logger.info(f"[FLYT] Skrev samlet fil: {samlet_flyt_sti}") + + logger.info(f"[DDL] FÆRDIG: {antal} fil(er) genereret.") + else: + logger.info("[DDL] Ingen DDL genereret.") + diff --git a/udpak_semistruktur/extract/extractor.py 
# --- file: udpak_semistruktur/extract/extractor.py ---
import re
from typing import Any, Optional

from udpak_semistruktur.logger import hent_logger
from udpak_semistruktur.utils import er_tom, evaluer_værdi_token, EMPTY_SENTINELS
from udpak_semistruktur.extract.traversal import (
    rekursiv_udpakning,
    hent_objekt_fra_sti,
    _resolve_with_indices,
    matcher_hvis_findes,
)

logger = hent_logger(__name__)


def udvid_rod_alternativer(rod: str) -> list[dict]:
    """Expand a root string with [a,b,c] alternative syntax into a list of
    {'rod': ..., 'variant': ...} dicts.

    A root without brackets (or a non-string) is returned unchanged with
    variant=None. Only a single, non-nested [..] group is supported.
    """
    if not isinstance(rod, str) or "[" not in rod:
        return [{"rod": rod, "variant": None}]

    m = re.fullmatch(r"(.*)\[([^\[\]]+)\](.*)", rod)
    if not m:
        return [{"rod": rod, "variant": None}]

    prefix = m.group(1)
    choices = [x.strip() for x in m.group(2).split(",") if x.strip()]
    suffix = m.group(3)
    return [
        {
            "rod": f"{prefix}{choice}{suffix}",
            "variant": choice
        }
        for choice in choices
    ]


def hent_fra_spec(spec, default_el, json_data, sti_index, global_config) -> tuple[Any, bool]:
    """Fetch a value from json_data based on a spec definition.

    Returns:
        tuple (value, missing) where missing=True means the field could not
        be resolved (as opposed to resolving to an empty value).
    """
    if spec is None:
        return None, False

    # Literal value (possibly a token like a date macro) wins over field lookup.
    if "værdi" in spec and spec.get("værdi") is not None:
        return evaluer_værdi_token(spec["værdi"], global_config), False

    felt = spec.get("felt")
    if felt == "@key":
        # The dict key captured by the rod="*" special case below.
        return (sti_index or {}).get("__key"), False

    rod = spec.get("rod")

    if felt is None:
        return None, True

    if felt == ".":
        # "." means: the current element itself.
        return default_el, False

    # Choose root/base for resolution
    if rod:
        parent_obj = hent_objekt_fra_sti(json_data, rod, sti_index)
        if not isinstance(parent_obj, (dict, list)):
            return None, True
        kandidat = _resolve_with_indices(parent_obj, felt, sti_index, base_path=rod)
    else:
        if not isinstance(default_el, (dict, list)):
            return None, True
        # base_path is unknown here; use "" - sti_index keys will still match
        # correctly when the field path itself hits lists (acc accumulates
        # from the field).
        kandidat = _resolve_with_indices(default_el, felt, sti_index, base_path="")

    missing = kandidat is None and not isinstance(default_el, list)
    return kandidat, missing


def hent_kolonne_værdi_med_fallback(kol: dict, el: Any, json_data: Any, sti_index: dict, global_config: dict) -> Any:
    """Fetch a column value with support for missing_fallback and tom_fallback.

    Resolution order:
      1) primary source (felt/rod or literal værdi),
      2) missing_fallback when the primary field could not be resolved,
      3) tom_fallback when the resolved value is empty.
    """
    kolnavn = kol.get("navn")
    # 1) Primary source
    if kol.get("felt") is None:
        raw_value = kol.get("værdi", None)
        værdi = evaluer_værdi_token(raw_value, global_config) if raw_value is not None else None
        missing = raw_value is None
    else:
        primær_spec = {"felt": kol["felt"]}
        if kol.get("rod"):
            primær_spec["rod"] = kol["rod"]
        værdi, missing = hent_fra_spec(primær_spec, el, json_data, sti_index, global_config)

    # 2) If missing -> try missing_fallback
    if missing:
        mf = kol.get("missing_fallback")
        if isinstance(mf, dict):
            v2, _ = hent_fra_spec(mf, el, json_data, sti_index, global_config)
            if not er_tom(v2):
                logger.debug(f"[UDTRÆK][FALLBACK][missing] kolonne={kolnavn} brugte missing_fallback={mf}")
                return v2

    # 3) If empty -> try tom_fallback
    if er_tom(værdi):
        tf = kol.get("tom_fallback")
        if isinstance(tf, dict):
            v3, _ = hent_fra_spec(tf, el, json_data, sti_index, global_config)
            if not er_tom(v3):
                logger.debug(f"[UDTRÆK][FALLBACK][tom] kolonne={kolnavn} brugte tom_fallback={tf}")
                return v3

    return værdi


def generer_datafil(json_data: Any, yaml_config: dict, global_config: dict) -> dict:
    """
    Extract rows from json_data based on yaml_config.
    Returns a dict with 'header' and 'rækker'.
    """
    output_filer = {}
    rod_sti = yaml_config["rod"]
    kolonner = yaml_config["kolonner"]

    hvis_findes = yaml_config.get("hvis_findes")

    # Robust special case for dict-of-dicts at rod="*":
    # store the key in sti_index["__key"] so @key can be used.
    if rod_sti in ("*", ".*") and isinstance(json_data, dict):
        objekter = [(v, {"__key": k}) for k, v in json_data.items()]
    else:
        objekter = []
        for rv in udvid_rod_alternativer(rod_sti):
            for element, sti_index in rekursiv_udpakning(json_data, rv["rod"]):
                ny_sti_index = dict(sti_index or {})
                if rv.get("variant") is not None:
                    ny_sti_index["__rod_variant"] = rv["variant"]
                    ny_sti_index["__rod_path"] = rv["rod"]
                objekter.append((element, ny_sti_index))

    rækker = []
    header = [k["navn"] for k in kolonner]

    for element, sti_index in objekter:
        # NOTE(review): 'in EMPTY_SENTINELS' assumes EMPTY_SENTINELS is a
        # list/tuple - a set would raise TypeError for unhashable dict/list
        # elements. Confirm against utils.EMPTY_SENTINELS.
        if element in EMPTY_SENTINELS:
            continue

        elementer = element if isinstance(element, list) else [element]

        for el in elementer:
            if el in EMPTY_SENTINELS:
                continue

            if hvis_findes and not matcher_hvis_findes(el, hvis_findes, sti_index):
                logger.debug(f"[UDTRÆK][hvis_findes] springer over record; ingen af stierne findes: {hvis_findes}")
                continue

            base_række = {}

            for kol in kolonner:
                navn = kol.get("navn")
                ktype = kol.get("type")

                if ktype == "rod_variant":
                    base_række[navn] = (sti_index or {}).get("__rod_variant")
                elif ktype == "rod_path":
                    base_række[navn] = (sti_index or {}).get("__rod_path")
                elif kol.get("felt") == ".":
                    base_række[navn] = el
                else:
                    base_række[navn] = hent_kolonne_værdi_med_fallback(
                        kol, el, json_data, sti_index, global_config
                    )

            rækker.append(base_række)

    output_filer['header'] = header
    output_filer['rækker'] = rækker
    return output_filer


# --- appended to udpak_semistruktur/transform/clean.py ---

def filename(data: dict, file_config: dict, global_config: dict) -> dict:
    """Set columns of type 'file' to the current file name from global_config.

    Assumes global_config["current_file"] was set by the reader for the record
    currently being processed.
    """
    kolonner = file_config.get("kolonner", [])
    for kolonne in kolonner:
        felt_type = kolonne.get("type", None)

        if felt_type == "file":
            for række in data["rækker"]:
                række[kolonne["navn"]] = global_config["current_file"]

    return data


# --- file: udpak_semistruktur/transform/hash.py ---
import hashlib
import json

from udpak_semistruktur.logger import hent_logger

logger = hent_logger(__name__)


def beregn_hash(data: dict, file_config: dict, global_config: dict) -> dict:
    """
    Compute a hash value for hash-type columns based on the other columns'
    values. Algorithm, separator and excluded columns are controlled by YAML.

    Raises:
        ValueError: when hash_algoritme is not in hashlib.algorithms_guaranteed.
    """
    kolonner = file_config.get("kolonner", [])
    for kolonne in kolonner:
        if kolonne.get("type", None) != "hash":
            continue

        algoritme = kolonne.get("hash_algoritme", "sha256")
        separator = kolonne.get("separator", "|")
        exclude = kolonne.get("hash_exclude", [])

        valid_algorithms = hashlib.algorithms_guaranteed
        # NOTE(review): algorithms_guaranteed also contains shake_128/shake_256,
        # whose hexdigest() requires a length argument and would fail below.
        if algoritme not in valid_algorithms:
            raise ValueError(f"Forkert HASH-algoritme: '{algoritme}'. Gyldige værdier: {', '.join(valid_algorithms)}")

        # FIX: the input-column list was previously accumulated across ALL
        # hash columns (it was initialised once outside this loop), so a
        # second hash column hashed every source column twice. Rebuild the
        # list per hash column instead.
        hash_kolonner = [
            kol["navn"]
            for kol in kolonner
            if kol.get("navn", None) not in exclude
            and kol.get("type", None) not in ["hash", "id"]
        ]

        # Hoisted out of the row loop - the algorithm is per-column constant.
        hash_func = getattr(hashlib, algoritme.lower())

        for række in data["rækker"]:
            værdier = []
            for navn in hash_kolonner:
                værdi = række.get(navn)
                if isinstance(værdi, (dict, list)):
                    # Canonical JSON so key order cannot change the hash.
                    værdi_str = json.dumps(værdi, sort_keys=True)
                else:
                    værdi_str = str(værdi)

                værdier.append(værdi_str)

            samlet_streng = separator.join(værdier).encode("utf-8")
            række[kolonne["navn"]] = hash_func(samlet_streng).hexdigest()

    return data