import os
import re
import json
import shutil
from pathlib import Path
from datetime import datetime

import cv2
import numpy as np  # BUGFIX: was imported only inside __main__, so library use crashed
import pandas as pd
from PIL import Image
import pytesseract

# --- HEIC support (optional) ---
try:
    import pillow_heif  # type: ignore

    pillow_heif.register_heif_opener()
    HEIC_OK = True
except Exception:
    HEIC_OK = False


# -----------------------------
# Image preprocessing: "make it like their clean scan"
# -----------------------------
def preprocess_image_to_png(input_path: Path, out_png: Path) -> None:
    """Load an image (jpg/png/heic/anything PIL reads) and write a clean,
    high-contrast black/white PNG that is stable for OCR.

    Args:
        input_path: source photo/scan of the invoice.
        out_png: destination PNG path (parent dirs are created).
    """
    img_pil = Image.open(input_path).convert("RGB")
    img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

    # grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # light denoise that preserves edges (text strokes)
    gray = cv2.bilateralFilter(gray, 9, 75, 75)

    # adaptive threshold — works better than a fixed one on unevenly lit scans
    bw = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        35, 11
    )

    # 2x upscale helps Tesseract on small print
    bw = cv2.resize(bw, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)

    out_png.parent.mkdir(parents=True, exist_ok=True)
    cv2.imwrite(str(out_png), bw)


def ocr_text(clean_png: Path) -> str:
    """OCR the preprocessed PNG in Russian and return normalized text.

    Uses Tesseract PSM 6 (assume a uniform block of text); runs of
    spaces/tabs are collapsed to single spaces.
    """
    config = "--psm 6"
    text = pytesseract.image_to_string(
        Image.open(clean_png), lang="rus", config=config
    )
    # normalize spaces
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()


# -----------------------------
# Parsing heuristics
# -----------------------------
DATE_RE = re.compile(r"(\d{2}\.\d{2}\.\d{4})")
NUM_RE = re.compile(
    r"(?:Счет[- ]?фактура|Сч[её]т[- ]?фактура|Счет[- ]?фактура\s*№|Сч[её]т[- ]?фактура\s*№)\s*№?\s*([0-9A-Za-zА-Яа-я\-\/]+)",
    re.IGNORECASE,
)
MONEY_RE = re.compile(r"(\d[\d \u00A0]*,\d{2})")  # e.g. 14 337,80
INT_RE = re.compile(r"^\d+$")


def _money_to_float(s: str) -> float:
    """Convert a Russian-formatted money string ('14 337,80') to float.

    Non-breaking and regular spaces are thousand separators; the comma is
    the decimal point. Returns 0.0 on anything unparsable (best effort).
    """
    s = s.replace("\u00A0", " ").replace(" ", "").replace(",", ".")
    try:
        return float(s)
    except Exception:
        return 0.0


def parse_header_fields(text: str) -> dict:
    """Extract basic header fields from OCR text (best effort).

    Fields: number, date, seller, buyer, total, nds_total. The total is
    heuristically the largest money-looking value in the text; VAT total is
    the largest money value adjacent to 'НДС'.
    """
    doc = {
        "type": "УПД",
        "direction": "Поступление",
        "source": "Счет-фактура (скан/OCR)",
        "number": None,
        "date": None,
        "seller": None,
        "buyer": None,
        "total": None,
        "nds_total": None,
        "raw": {},
    }

    # document number
    m = NUM_RE.search(text)
    if m:
        doc["number"] = m.group(1).strip()

    # date: prefer the first date in the text (usually near the header)
    d = DATE_RE.search(text)
    if d:
        doc["date"] = d.group(1)

    # seller / buyer (very rough) — usually "Продавец:" and "Покупатель:"
    seller = re.search(r"Продавец[:\s]+(.+)", text)
    buyer = re.search(r"Покупатель[:\s]+(.+)", text)
    if seller:
        doc["seller"] = seller.group(1).strip()[:120]
    if buyer:
        doc["buyer"] = buyer.group(1).strip()[:120]

    # totals: take the largest money number as "total"
    monies = MONEY_RE.findall(text)
    if monies:
        vals = [_money_to_float(x) for x in monies]
        if vals:
            doc["total"] = round(max(vals), 2)

    # VAT total: money values that appear right after "НДС"
    nds_candidates = re.findall(r"НДС[^0-9]*?(\d[\d \u00A0]*,\d{2})", text)
    if nds_candidates:
        # often there are several; take the biggest
        nds_vals = [_money_to_float(x) for x in nds_candidates]
        doc["nds_total"] = round(max(nds_vals), 2) if nds_vals else None

    doc["raw"]["money_samples"] = monies[:30]
    return doc


def parse_items_best_effort(text: str) -> list[dict]:
    """Best-effort line-item parser for scanned счет-фактура/УПД tables.

    Heuristic: keep lines that carry at least 3 numeric tokens and at least
    one money-looking token; the text before the first number is treated as
    the item name. qty/price/sum guesses are stored together with the raw
    line so a human can verify. Duplicate raw lines are dropped.
    """
    items = []
    lines = [l.strip() for l in text.splitlines() if l.strip()]

    for line in lines:
        # quick skip of obviously-too-short lines
        if len(line) < 10:
            continue

        money = MONEY_RE.findall(line)
        # also capture integers/floats such as quantities and codes
        nums = re.findall(r"(\d+[.,]\d+|\d+)", line)

        # heuristic: >=3 numeric tokens and >=1 money token
        if len(nums) >= 3 and len(money) >= 1:
            # name = left part before the first numeric sequence
            split = re.split(r"\s\d", line, maxsplit=1)
            name = split[0].strip()

            qty = None
            price = None
            amount = None

            # e.g. "... 25,000 ... 249,17 ... 6 229,17":
            # first float-like token is often the quantity,
            # last money token is usually the line sum.
            floats = [x for x in nums if ("," in x or "." in x)]
            if len(floats) >= 1:
                qty = float(floats[0].replace(",", "."))
            if len(money) >= 1:
                amount = _money_to_float(money[-1])
            if len(money) >= 2:
                # price is often an earlier money value on the line
                price = _money_to_float(money[0])

            if name and (qty is not None or amount is not None):
                items.append({
                    "name": name[:200],
                    "qty": qty,
                    "unit": None,
                    "price": price,
                    "sum": amount,
                    "vat": None,
                    "raw_line": line,
                })

    # de-duplicate by raw_line, preserving order
    uniq = []
    seen = set()
    for it in items:
        key = it["raw_line"]
        if key not in seen:
            seen.add(key)
            uniq.append(it)
    return uniq


# -----------------------------
# Pipeline over folder
# -----------------------------
def process_file(file_path: Path, out_dir: Path) -> None:
    """Run the full pipeline for one image: preprocess -> OCR -> parse.

    Writes into out_dir/<stem>/: sf_clean.png, text.txt, doc.json, items.csv.
    """
    base = file_path.stem
    doc_out = out_dir / base
    doc_out.mkdir(parents=True, exist_ok=True)

    clean_png = doc_out / "sf_clean.png"
    text_txt = doc_out / "text.txt"
    json_out = doc_out / "doc.json"
    items_csv = doc_out / "items.csv"

    # 1) preprocess
    preprocess_image_to_png(file_path, clean_png)

    # 2) OCR
    text = ocr_text(clean_png)
    text_txt.write_text(text, encoding="utf-8")

    # 3) parse
    doc = parse_header_fields(text)
    items = parse_items_best_effort(text)
    doc["items"] = items
    json_out.write_text(
        json.dumps(doc, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # 4) table export (utf-8-sig so Excel opens Cyrillic correctly)
    df = pd.DataFrame(items)
    if not df.empty:
        df.to_csv(items_csv, index=False, encoding="utf-8-sig")
    else:
        items_csv.write_text(
            "name,qty,unit,price,sum,vat,raw_line\n", encoding="utf-8-sig"
        )


def run_folder(in_dir: Path, out_dir: Path) -> None:
    """Process every supported image in in_dir and build a combined XLSX.

    Per-file artifacts land in out_dir/<stem>/; all parsed items are
    concatenated into out_dir/_all_items.xlsx with a source_file column.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    files = []
    for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".heic"):
        files.extend(in_dir.glob(f"*{ext}"))
        files.extend(in_dir.glob(f"*{ext.upper()}"))

    # BUGFIX: on case-insensitive filesystems (macOS/Windows) the lower- and
    # upper-case globs match the same file twice — dedupe before processing.
    files = sorted(set(files))

    if not files:
        print("Нет файлов в папке:", in_dir)
        return

    all_rows = []
    for f in files:
        print("Обрабатываю:", f.name)
        process_file(f, out_dir)

        # collect per-file CSVs for the global XLSX (best effort)
        doc_dir = out_dir / f.stem
        csv_path = doc_dir / "items.csv"
        try:
            df = pd.read_csv(csv_path)
            if not df.empty:
                df.insert(0, "source_file", f.name)
                all_rows.append(df)
        except Exception:
            pass

    if all_rows:
        big = pd.concat(all_rows, ignore_index=True)
        big_path = out_dir / "_all_items.xlsx"
        big.to_excel(big_path, index=False)
        print("Готово! Общая таблица:", big_path)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="in_dir", required=True,
                        help="Папка с фото/сканами счет-фактур")
    parser.add_argument("--out", dest="out_dir", required=True,
                        help="Папка вывода")
    args = parser.parse_args()

    run_folder(Path(args.in_dir), Path(args.out_dir))
# (web-page footer residue removed: "Made on Tilda" — not part of the script)