import json
import os
import re
import shutil
from datetime import datetime
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image
# --- HEIC support (optional) ---
# iPhone photos are commonly HEIC; pillow-heif registers an opener so that
# plain PIL.Image.open() can read them.
try:
    import pillow_heif  # type: ignore

    pillow_heif.register_heif_opener()
    HEIC_OK = True
except Exception:
    # Library not installed (or failed to load) -> run without HEIC support.
    HEIC_OK = False
# Image preprocessing: "make it like their clean scan"
# -----------------------------
def preprocess_image_to_png(input_path: Path, out_png: Path) -> None:
"""
Loads image (jpg/png/heic/whatever PIL can read), makes clean high-contrast B/W,
writes PNG for stable OCR.
"""
img_pil = Image.open(input_path).convert("RGB")
img = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
# grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# denoise a bit
gray = cv2.bilateralFilter(gray, 9, 75, 75)
# adaptive threshold (works better than fixed on uneven scans)
bw = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, 35, 11
)
# upscale (helps OCR on small text)
bw = cv2.resize(bw, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
out_png.parent.mkdir(parents=True, exist_ok=True)
cv2.imwrite(str(out_png), bw)
def ocr_text(clean_png: Path) -> str:
    """Run Russian-language OCR on the cleaned PNG and return trimmed text."""
    # PSM 6: assume a single uniform block of text.
    config = "--psm 6"
    raw = pytesseract.image_to_string(
        Image.open(clean_png), lang="rus", config=config
    )
    # Collapse runs of spaces/tabs so downstream regexes stay simple.
    return re.sub(r"[ \t]+", " ", raw).strip()
# -----------------------------
# Parsing heuristics
# -----------------------------
# dd.mm.yyyy dates, e.g. "25.03.2024".
DATE_RE = re.compile(r"(\d{2}\.\d{2}\.\d{4})")
# Invoice number after the "Счет-фактура" header; tolerates "е"/"ё" spelling,
# an optional "№", and alphanumeric numbers containing "-" or "/".
NUM_RE = re.compile(r"(?:Счет[- ]?фактура|Сч[её]т[- ]?фактура|Счет[- ]?фактура\s*№|Сч[её]т[- ]?фактура\s*№)\s*№?\s*([0-9A-Za-zА-Яа-я\-\/]+)", re.IGNORECASE)
# Money amounts with space/NBSP thousands separators and comma decimals,
# e.g. "14 337,80".
MONEY_RE = re.compile(r"(\d[\d \u00A0]*,\d{2})")  # 14 337,80
# A token consisting solely of digits.
INT_RE = re.compile(r"^\d+$")
def _money_to_float(s: str) -> float:
s = s.replace("\u00A0", " ").replace(" ", "").replace(",", ".")
try:
return float(s)
except Exception:
return 0.0
def parse_header_fields(text: str) -> dict:
    """
    Best-effort extraction of header fields from OCR text:
    number, date, seller, buyer, total and total VAT.
    """
    doc = {
        "type": "УПД",
        "direction": "Поступление",
        "source": "Счет-фактура (скан/OCR)",
        "number": None,
        "date": None,
        "seller": None,
        "buyer": None,
        "total": None,
        "nds_total": None,
        "raw": {},
    }

    # Document number from the "Счет-фактура № ..." header.
    num_match = NUM_RE.search(text)
    if num_match:
        doc["number"] = num_match.group(1).strip()

    # The first date in the text is usually the header date.
    date_match = DATE_RE.search(text)
    if date_match:
        doc["date"] = date_match.group(1)

    # Parties are typically labeled "Продавец:" / "Покупатель:" (very rough).
    for field, label in (("seller", "Продавец"), ("buyer", "Покупатель")):
        party = re.search(label + r"[:\s]+(.+)", text)
        if party:
            doc[field] = party.group(1).strip()[:120]

    # Heuristic: the largest money amount on the page is the grand total.
    monies = MONEY_RE.findall(text)
    if monies:
        doc["total"] = round(max(_money_to_float(m) for m in monies), 2)

    # Total VAT: the largest amount appearing right after "НДС".
    nds_candidates = re.findall(r"НДС[^0-9]*?(\d[\d \u00A0]*,\d{2})", text)
    if nds_candidates:
        doc["nds_total"] = round(
            max(_money_to_float(m) for m in nds_candidates), 2
        )

    doc["raw"]["money_samples"] = monies[:30]
    return doc
def parse_items_best_effort(text: str) -> list[dict]:
    """
    Best-effort line-item parser for scanned счет-фактура/УПД tables.

    True table reconstruction from OCR output is hard, so heuristics are used:
      - keep lines containing at least 3 numeric tokens and 1 money token
      - treat the text before the first numeric run as the item name
      - guess qty/price/sum from the numeric tokens

    Works surprisingly OK on many УПД scans after good binarization.
    """
    items = []
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    for line in lines:
        # Too short to be a table row.
        if len(line) < 10:
            continue
        # Money-looking tokens ("1 234,56") on this line.
        money = MONEY_RE.findall(line)
        # All numeric tokens (integers and decimals): quantities, codes, sums.
        nums = re.findall(r"(\d+[.,]\d+|\d+)", line)
        # Heuristic: a real item row has >= 3 numbers and >= 1 money amount.
        if len(nums) >= 3 and len(money) >= 1:
            # Name = everything left of the first " <digit>" boundary.
            split = re.split(r"\s\d", line, maxsplit=1)
            name = split[0].strip()
            # Guess qty/price/sum from tokens (best effort); raw line is kept
            # alongside so nothing is lost when the guess is wrong.
            qty = None
            price = None
            amount = None
            # Example row: "... 25,000 ... 249,17 ... 6 229,17"
            # -> first decimal-looking token is qty, last money token is the
            #    line total, an earlier money token is the unit price.
            floats = [x for x in nums if ("," in x or "." in x)]
            if len(floats) >= 1:
                # First decimal token is usually the quantity.
                qty = float(floats[0].replace(",", "."))
            if len(money) >= 1:
                amount = _money_to_float(money[-1])
            if len(money) >= 2:
                # Unit price usually appears before the line total.
                price = _money_to_float(money[0])
            if name and (qty is not None or amount is not None):
                items.append({
                    "name": name[:200],
                    "qty": qty,
                    "unit": None,
                    "price": price,
                    "sum": amount,
                    "vat": None,
                    "raw_line": line
                })
    # De-duplicate rows by their raw OCR line.
    uniq = []
    seen = set()
    for it in items:
        key = it["raw_line"]
        if key not in seen:
            seen.add(key)
            uniq.append(it)
    return uniq
# -----------------------------
# Pipeline over folder
# -----------------------------
def process_file(file_path: Path, out_dir: Path) -> None:
    """
    Run the full pipeline for one scan: preprocess -> OCR -> parse -> export.

    Creates <out_dir>/<stem>/ containing sf_clean.png, text.txt, doc.json
    and items.csv.
    """
    doc_dir = out_dir / file_path.stem
    doc_dir.mkdir(parents=True, exist_ok=True)
    clean_png = doc_dir / "sf_clean.png"

    # 1) clean up the image for OCR
    preprocess_image_to_png(file_path, clean_png)

    # 2) recognize the text
    text = ocr_text(clean_png)
    (doc_dir / "text.txt").write_text(text, encoding="utf-8")

    # 3) parse header fields + line items
    doc = parse_header_fields(text)
    items = parse_items_best_effort(text)
    doc["items"] = items
    (doc_dir / "doc.json").write_text(
        json.dumps(doc, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # 4) tabular export (header-only CSV when nothing was parsed)
    items_csv = doc_dir / "items.csv"
    df = pd.DataFrame(items)
    if df.empty:
        items_csv.write_text("name,qty,unit,price,sum,vat,raw_line\n", encoding="utf-8-sig")
    else:
        df.to_csv(items_csv, index=False, encoding="utf-8-sig")
def run_folder(in_dir: Path, out_dir: Path) -> None:
    """
    Process every supported image in ``in_dir`` and aggregate the results.

    Each file gets a per-document folder under ``out_dir`` (see
    ``process_file``); afterwards all per-document item CSVs are merged into
    ``out_dir / "_all_items.xlsx"``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Collect into a set: on case-insensitive filesystems (Windows/macOS)
    # globbing both "*.jpg" and "*.JPG" returns the SAME file twice, which
    # previously made every scan get processed and aggregated twice.
    files: set[Path] = set()
    for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".heic"):
        files.update(in_dir.glob(f"*{ext}"))
        files.update(in_dir.glob(f"*{ext.upper()}"))
    if not files:
        print("Нет файлов в папке:", in_dir)
        return

    all_rows = []
    for f in sorted(files):
        print("Обрабатываю:", f.name)
        process_file(f, out_dir)
        # Collect this document's items for the combined XLSX.
        csv_path = out_dir / f.stem / "items.csv"
        try:
            df = pd.read_csv(csv_path)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
            # Best-effort aggregation: skip missing/unreadable per-doc CSVs
            # (narrowed from a silent bare `except Exception`).
            continue
        if not df.empty:
            df.insert(0, "source_file", f.name)
            all_rows.append(df)

    if all_rows:
        big = pd.concat(all_rows, ignore_index=True)
        big_path = out_dir / "_all_items.xlsx"
        big.to_excel(big_path, index=False)
        print("Готово! Общая таблица:", big_path)
if __name__ == "__main__":
    # CLI entry point: batch-process a folder of invoice scans.
    import argparse

    # NOTE: numpy was previously imported only here ("to keep top clean"),
    # which made preprocess_image_to_png crash with NameError whenever this
    # module was imported instead of run as a script. numpy now lives in the
    # top-level imports where the function actually needs it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="in_dir", required=True, help="Папка с фото/сканами счет-фактур")
    parser.add_argument("--out", dest="out_dir", required=True, help="Папка вывода")
    args = parser.parse_args()
    run_folder(Path(args.in_dir), Path(args.out_dir))