import json
import os
import re
import shutil
from datetime import datetime
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image
# --- HEIC support (optional) ---
# iPhone photos are commonly HEIC; pillow-heif registers an opener so that
# plain PIL.Image.open() can read them.
try:
    import pillow_heif  # type: ignore

    pillow_heif.register_heif_opener()
    HEIC_OK = True
except Exception:
    # Library not installed (or failed to load) -> run without HEIC support.
    HEIC_OK = False
# Image preprocessing: "make it like their clean scan"
# -----------------------------
def preprocess_image_to_png(input_path: Path, out_png: Path) -> None:
"""
Loads image (jpg/png/heic/whatever PIL can read), makes clean high-contrast B/W,
writes PNG for stable OCR.
"""
img_pil = Image.open(input_path).convert("RGB")
img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
# grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# denoise a bit
gray = cv2.bilateralFilter(gray, 9, 75, 75)
# adaptive threshold (works better than fixed on uneven scans)
bw = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, 35, 11
)
# upscale (helps OCR on small text)
bw = cv2.resize(bw, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
out_png.parent.mkdir(parents=True, exist_ok=True)
cv2.imwrite(str(out_png), bw)
def ocr_text(clean_png: Path) -> str:
    """Run Russian-language OCR on the cleaned PNG and return trimmed text."""
    # PSM 6: assume a single uniform block of text.
    config = "--psm 6"
    raw = pytesseract.image_to_string(
        Image.open(clean_png), lang="rus", config=config
    )
    # Collapse runs of spaces/tabs so downstream regexes stay simple.
    return re.sub(r"[ \t]+", " ", raw).strip()
# -----------------------------
# Parsing heuristics
# -----------------------------
# dd.mm.yyyy dates, e.g. "25.03.2024".
DATE_RE = re.compile(r"(\d{2}\.\d{2}\.\d{4})")
# Invoice number after the "Счет-фактура" header; tolerates "е"/"ё" spelling,
# an optional "№", and alphanumeric numbers containing "-" or "/".
NUM_RE = re.compile(r"(?:Счет[- ]?фактура|Сч[её]т[- ]?фактура|Счет[- ]?фактура\s*№|Сч[её]т[- ]?фактура\s*№)\s*№?\s*([0-9A-Za-zА-Яа-я\-\/]+)", re.IGNORECASE)
# Money amounts with space/NBSP thousands separators and comma decimals,
# e.g. "14 337,80".
MONEY_RE = re.compile(r"(\d[\d \u00A0]*,\d{2})")  # 14 337,80
# A token consisting solely of digits.
INT_RE = re.compile(r"^\d+$")
def _money_to_float(s: str) -> float:
s = s.replace("\u00A0", " ").replace(" ", "").replace(",", ".")
try:
return float(s)
except Exception:
return 0.0
def parse_header_fields(text: str) -> dict:
    """
    Best-effort extraction of header fields from OCR text:
    number, date, seller, buyer, total and total VAT.
    """
    doc = {
        "type": "УПД",
        "direction": "Поступление",
        "source": "Счет-фактура (скан/OCR)",
        "number": None,
        "date": None,
        "seller": None,
        "buyer": None,
        "total": None,
        "nds_total": None,
        "raw": {},
    }

    # Document number from the "Счет-фактура № ..." header.
    num_match = NUM_RE.search(text)
    if num_match:
        doc["number"] = num_match.group(1).strip()

    # The first date in the text is usually the header date.
    date_match = DATE_RE.search(text)
    if date_match:
        doc["date"] = date_match.group(1)

    # Parties are typically labeled "Продавец:" / "Покупатель:" (very rough).
    for field, label in (("seller", "Продавец"), ("buyer", "Покупатель")):
        party = re.search(label + r"[:\s]+(.+)", text)
        if party:
            doc[field] = party.group(1).strip()[:120]

    # Heuristic: the largest money amount on the page is the grand total.
    monies = MONEY_RE.findall(text)
    if monies:
        doc["total"] = round(max(_money_to_float(m) for m in monies), 2)

    # Total VAT: the largest amount appearing right after "НДС".
    nds_candidates = re.findall(r"НДС[^0-9]*?(\d[\d \u00A0]*,\d{2})", text)
    if nds_candidates:
        doc["nds_total"] = round(
            max(_money_to_float(m) for m in nds_candidates), 2
        )

    doc["raw"]["money_samples"] = monies[:30]
    return doc
def parse_items_best_effort(text: str) -> list[dict]:
    """
    Best-effort line-item parser for scanned счет-фактура/УПД tables.

    True table reconstruction from OCR output is hard, so heuristics are used:
      - keep lines containing at least 3 numeric tokens and 1 money token
      - treat the text before the first numeric run as the item name
      - guess qty/price/sum from the numeric tokens

    Works surprisingly OK on many УПД scans after good binarization.
    """
    items = []
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    for line in lines:
        # Too short to be a table row.
        if len(line) < 10:
            continue
        # Money-looking tokens ("1 234,56") on this line.
        money = MONEY_RE.findall(line)
        # All numeric tokens (integers and decimals): quantities, codes, sums.
        nums = re.findall(r"(\d+[.,]\d+|\d+)", line)
        # Heuristic: a real item row has >= 3 numbers and >= 1 money amount.
        if len(nums) >= 3 and len(money) >= 1:
            # Name = everything left of the first " <digit>" boundary.
            split = re.split(r"\s\d", line, maxsplit=1)
            name = split[0].strip()
            # Guess qty/price/sum from tokens (best effort); raw line is kept
            # alongside so nothing is lost when the guess is wrong.
            qty = None
            price = None
            amount = None
            # Example row: "... 25,000 ... 249,17 ... 6 229,17"
            # -> first decimal-looking token is qty, last money token is the
            #    line total, an earlier money token is the unit price.
            floats = [x for x in nums if ("," in x or "." in x)]
            if len(floats) >= 1:
                # First decimal token is usually the quantity.
                qty = float(floats[0].replace(",", "."))
            if len(money) >= 1:
                amount = _money_to_float(money[-1])
            if len(money) >= 2:
                # Unit price usually appears before the line total.
                price = _money_to_float(money[0])
            if name and (qty is not None or amount is not None):
                items.append({
                    "name": name[:200],
                    "qty": qty,
                    "unit": None,
                    "price": price,
                    "sum": amount,
                    "vat": None,
                    "raw_line": line
                })
    # De-duplicate rows by their raw OCR line.
    uniq = []
    seen = set()
    for it in items:
        key = it["raw_line"]
        if key not in seen:
            seen.add(key)
            uniq.append(it)
    return uniq
# -----------------------------
# Pipeline over folder
# -----------------------------
def process_file(file_path: Path, out_dir: Path) -> None:
    """
    Run the full pipeline for one scan: preprocess -> OCR -> parse -> export.

    Creates <out_dir>/<stem>/ containing sf_clean.png, text.txt, doc.json
    and items.csv.
    """
    doc_dir = out_dir / file_path.stem
    doc_dir.mkdir(parents=True, exist_ok=True)
    clean_png = doc_dir / "sf_clean.png"

    # 1) clean up the image for OCR
    preprocess_image_to_png(file_path, clean_png)

    # 2) recognize the text
    text = ocr_text(clean_png)
    (doc_dir / "text.txt").write_text(text, encoding="utf-8")

    # 3) parse header fields + line items
    doc = parse_header_fields(text)
    items = parse_items_best_effort(text)
    doc["items"] = items
    (doc_dir / "doc.json").write_text(
        json.dumps(doc, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # 4) tabular export (header-only CSV when nothing was parsed)
    items_csv = doc_dir / "items.csv"
    df = pd.DataFrame(items)
    if df.empty:
        items_csv.write_text("name,qty,unit,price,sum,vat,raw_line\n", encoding="utf-8-sig")
    else:
        df.to_csv(items_csv, index=False, encoding="utf-8-sig")
def run_folder(in_dir: Path, out_dir: Path) -> None:
    """
    Process every supported image in ``in_dir`` and aggregate the results.

    Each file gets a per-document folder under ``out_dir`` (see
    ``process_file``); afterwards all per-document item CSVs are merged into
    ``out_dir / "_all_items.xlsx"``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Collect into a set: on case-insensitive filesystems (Windows/macOS)
    # globbing both "*.jpg" and "*.JPG" returns the SAME file twice, which
    # previously made every scan get processed and aggregated twice.
    files: set[Path] = set()
    for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".heic"):
        files.update(in_dir.glob(f"*{ext}"))
        files.update(in_dir.glob(f"*{ext.upper()}"))
    if not files:
        print("Нет файлов в папке:", in_dir)
        return

    all_rows = []
    for f in sorted(files):
        print("Обрабатываю:", f.name)
        process_file(f, out_dir)
        # Collect this document's items for the combined XLSX.
        csv_path = out_dir / f.stem / "items.csv"
        try:
            df = pd.read_csv(csv_path)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
            # Best-effort aggregation: skip missing/unreadable per-doc CSVs
            # (narrowed from a silent bare `except Exception`).
            continue
        if not df.empty:
            df.insert(0, "source_file", f.name)
            all_rows.append(df)

    if all_rows:
        big = pd.concat(all_rows, ignore_index=True)
        big_path = out_dir / "_all_items.xlsx"
        big.to_excel(big_path, index=False)
        print("Готово! Общая таблица:", big_path)
if __name__ == "__main__":
    # CLI entry point: batch-process a folder of invoice scans.
    import argparse

    # NOTE: numpy was previously imported only here ("to keep top clean"),
    # which made preprocess_image_to_png crash with NameError whenever this
    # module was imported instead of run as a script. numpy now lives in the
    # top-level imports where the function actually needs it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="in_dir", required=True, help="Папка с фото/сканами счет-фактур")
    parser.add_argument("--out", dest="out_dir", required=True, help="Папка вывода")
    args = parser.parse_args()
    run_folder(Path(args.in_dir), Path(args.out_dir))