From 41823dcbf8a6d5198ea4014f969eb4829436c916 Mon Sep 17 00:00:00 2001
From: GRayHook
Date: Sun, 31 May 2026 16:16:51 +0700
Subject: [PATCH] init

---
 README.md        |  74 ++++++++++
 extract.py       | 366 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   3 +
 run.sh           |  25 ++++
 4 files changed, 468 insertions(+)
 create mode 100644 README.md
 create mode 100644 extract.py
 create mode 100644 requirements.txt
 create mode 100755 run.sh

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d37b029
--- /dev/null
+++ b/README.md
@@ -0,0 +1,74 @@
+# pdf2ai
+
+Model-free конвертер PDF → markdown для локального агента: главы по разделам, картинки рядом, grep по `<!-- page N -->`.
+
+## Зависимости
+
+- Python **3.10+**
+- pip-пакеты из [`requirements.txt`](requirements.txt): `pymupdf`, `pdfplumber`, `Pillow`
+
+Скрипт [`run.sh`](run.sh) сам создаёт `.venv` и ставит зависимости.
+
+## Быстрый старт
+
+```bash
+chmod +x run.sh
+./run.sh # по умолчанию RealTek-r8169.pdf
+./run.sh path/to/doc.pdf # свой PDF
+./run.sh doc.pdf -o out/my-doc # свой каталог вывода
+```
+
+Структура вывода `out/<stem>/` (по умолчанию):
+
+```
+out/RealTek-r8169/
+├── INDEX.md # оглавление со ссылками на главы
+├── chapters/ # одна глава = один файл
+│   ├── 01-features.md
+│   └── ...
+└── assets/ # embedded PNG + page PNG для сломанных layout-страниц
+```
+
+С флагом `--single-file` вместо `INDEX.md` и `chapters/` пишется один `{stem}.md` (каталог `assets/` всё равно создаётся).
+
+### Опции extract.py
+
+| Флаг | Описание |
+|------|----------|
+| `--single-file` | Один `{stem}.md` вместо `INDEX.md` + `chapters/` |
+| `--tables` | Включить извлечение таблиц pdfplumber (по умолчанию выключено) |
+| `--page-dpi N` | DPI для PNG-рендера страниц с битым текстом (default 150) |
+| `-o DIR` | Каталог вывода |
+
+## Grep для агента
+
+Главы разбиты по крупным разделам документа. Каждая страница помечена HTML-комментарием `<!-- page N -->`:
+
+```bash
+# найти упоминание регистра в главе 6
+rg -n 'DTCCR' out/RealTek-r8169/chapters/06-register-descriptions.md
+
+# все вхождения по всему документу
+rg -n 'Interrupt Mask' out/RealTek-r8169/
+
+# контекст вокруг страницы 51
+rg -n -C3 'page 51' out/RealTek-r8169/chapters/09-functional-description.md
+```
+
+Начните с [`INDEX.md`](out/RealTek-r8169/INDEX.md) — там ссылки на все главы.
+
+## Ограничения (model-free)
+
+Этот инструмент **не использует LLM/VLM**. Он комбинирует:
+
+- PyMuPDF — текст страниц и embedded-изображения
+- pdfplumber — опциональные таблицы (`--tables`)
+- эвристику «сломанной страницы» — PNG-рендер bitfield/diagram layout
+
+**Что хорошо:** быстро, локально, детерминированно; grep по главам; картинки сохраняются рядом.
+
+**Что хуже, чем Marker/Docling:** сложная вёрстка, multi-column, сканы без OCR, семантическая структура таблиц. Для таких PDF нужны модельные конвертеры или OCR.
+
+## Roadmap
+
+См. [`plan.md`](plan.md).
diff --git a/extract.py b/extract.py
new file mode 100644
index 0000000..417b45c
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,440 @@
+#!/usr/bin/env python3
+"""Model-free PDF → markdown chapters for agent grep (text + images + tables)."""
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import re
+import shutil
+import sys
+from pathlib import Path
+
+import fitz
+import pdfplumber
+
+# Repeated header/footer on most pages of this spec family
+HEADER_FOOTER = re.compile(
+    r"^(RTL8169|2002/\d{2}/\d{2}|Rev\.\d+\.\d+|\d{1,3})$",
+    re.MULTILINE,
+)
+MAJOR_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][^\n]{2,})$")
+TOC_LINE = re.compile(r"\.{4,}\s*\d+\s*$")
+MIN_TABLE_ROWS = 2
+MIN_TABLE_CELLS = 6
+MIN_CELL_LEN_FOR_DEDUP = 4
+SPACED_LETTERS = re.compile(r"^([A-Z](?: [A-Z]){1,}|[A-Z])$")
+DEFAULT_PAGE_DPI = 150
+
+
+def slugify(title: str) -> str:
+    """Return a lowercase, dash-separated slug (at most 80 chars) for *title*.
+
+    Falls back to ``"section"`` when no usable character is left.
+    """
+    s = re.sub(r"[^\w\s-]", "", title, flags=re.UNICODE)
+    s = re.sub(r"[-\s]+", "-", s.strip()).strip("-").lower()
+    # Truncation may cut right after a separator; drop the dangling dash.
+    return s[:80].rstrip("-") or "section"
+
+
+def clean_page_text(raw: str) -> str:
+    """Drop header/footer lines and collapse runs of blank lines in *raw*."""
+    lines = []
+    for line in raw.splitlines():
+        t = line.strip()
+        if not t:
+            lines.append("")
+            continue
+        if HEADER_FOOTER.match(t):
+            continue
+        lines.append(line.rstrip())
+    text = "\n".join(lines)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+
+def is_toc_page(text: str) -> bool:
+    """Tell whether *text* looks like a table-of-contents page.
+
+    True when dot-leader lines number at least 3 and at least a quarter of
+    the non-empty lines.
+    """
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    if not lines:
+        return False
+    dotted = sum(1 for ln in lines if TOC_LINE.search(ln))
+    return dotted >= max(3, len(lines) // 4)
+
+
+def _max_vertical_letter_run(text: str) -> int:
+    """Return the longest run of consecutive lines holding a single letter."""
+    run = best = 0
+    for line in text.splitlines():
+        token = line.strip()
+        if len(token) == 1 and token.isalpha():
+            run += 1
+            best = max(best, run)
+        else:
+            run = 0
+    return best
+
+
+def is_broken_page(text: str) -> bool:
+    """Detect layout-heavy pages where get_text() yields vertical single letters."""
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    if len(lines) < 12:
+        return False
+    if _max_vertical_letter_run(text) >= 4:
+        return True
+    single = sum(1 for ln in lines if len(ln) == 1)
+    short = sum(1 for ln in lines if len(ln) <= 2)
+    normal = sum(1 for ln in lines if len(ln) >= 10 or (" " in ln and len(ln) >= 5))
+    # Bitfield/diagram pages: many one-letter lines stacked vertically
+    if single >= 20:
+        return True
+    if single >= 12 and single > normal * 0.45:
+        return True
+    # Mostly fragmented tokens, little readable prose
+    if short >= 25 and short > normal and short / len(lines) >= 0.35:
+        return True
+    return False
+
+
+def is_layout_table(rows: list[list]) -> bool:
+    """Reject bitfield/diagram tables made of spaced letters and empty cells."""
+    if len(rows) < MIN_TABLE_ROWS:
+        return True
+    cells = [(c or "").strip() for row in rows for c in row]
+    non_empty = [c for c in cells if c]
+    filled = len(non_empty)
+    # Also rejects the all-empty table, so the ratios below never divide by 0.
+    if filled < MIN_TABLE_CELLS:
+        return True
+    short = sum(1 for c in non_empty if len(c) <= 3)
+    spaced = sum(1 for c in non_empty if SPACED_LETTERS.match(c))
+    return short / filled > 0.55 or spaced / filled > 0.25
+
+
+def table_overlaps_page_text(rows: list[list], page_text: str) -> bool:
+    """Skip markdown table when its cell text is already on the page."""
+    page_norm = re.sub(r"\s+", " ", page_text.lower())
+    hits = 0
+    checked = 0
+    for row in rows:
+        for cell in row:
+            c = (cell or "").strip().replace("\n", " ")
+            if len(c) < MIN_CELL_LEN_FOR_DEDUP:
+                continue
+            checked += 1
+            if c.lower() in page_norm:
+                hits += 1
+                continue
+            # Long descriptions: match if a substantial prefix is present
+            if len(c) >= 40 and c[:40].lower() in page_norm:
+                hits += 1
+    return checked >= 3 and hits / checked >= 0.5
+
+
+def table_to_markdown(table: list[list], page_text: str) -> str | None:
+    """Render a pdfplumber table as markdown, or return None if it is noise.
+
+    None is returned for tables with fewer than MIN_TABLE_ROWS non-empty
+    rows, fewer than two columns, layout-only content (is_layout_table) or
+    cells already present in *page_text* (table_overlaps_page_text).
+    The first non-empty row becomes the header row.
+    """
+    rows = []
+    for row in table:
+        cells = [(c or "").strip().replace("\n", " ") for c in row]
+        if any(cells):
+            rows.append(cells)
+    if len(rows) < MIN_TABLE_ROWS:
+        return None
+    width = max(len(r) for r in rows)
+    if width < 2:
+        return None
+    if is_layout_table(rows) or table_overlaps_page_text(rows, page_text):
+        return None
+    # Escape "|" so a cell cannot be split into extra columns, then pad rows.
+    norm = [
+        [c.replace("|", "\\|") for c in r] + [""] * (width - len(r)) for r in rows
+    ]
+    # Every kept row has a non-empty cell, so the header is never blank.
+    header, *body = norm
+    lines = [
+        "| " + " | ".join(header) + " |",
+        "| " + " | ".join("---" for _ in range(width)) + " |",
+    ]
+    lines.extend("| " + " | ".join(row) + " |" for row in body)
+    return "\n".join(lines)
+
+
+def extract_images(doc: fitz.Document, page_index: int, assets_dir: Path, prefix: str) -> list[str]:
+    """Save the images embedded in one page and return their relative paths.
+
+    Files are written to *assets_dir* as ``{prefix}-pNNN-imgK.{ext}`` and
+    returned as ``assets/<name>``. Images that cannot be extracted are
+    skipped with a warning on stderr instead of aborting the conversion.
+    """
+    paths: list[str] = []
+    page = doc[page_index]
+    for img_index, img in enumerate(page.get_images(full=True)):
+        xref = img[0]
+        try:
+            info = doc.extract_image(xref)
+        except Exception as exc:  # best effort: keep converting other images
+            print(
+                f"warning: page {page_index + 1}: image xref {xref} skipped: {exc}",
+                file=sys.stderr,
+            )
+            continue
+        # NOTE(review): guards against a result without image bytes — confirm
+        # against the PyMuPDF version in use.
+        data = info.get("image") if info else None
+        if not data:
+            continue
+        ext = info.get("ext", "png")
+        name = f"{prefix}-p{page_index + 1:03d}-img{img_index + 1}.{ext}"
+        out = assets_dir / name
+        out.write_bytes(data)
+        paths.append(f"assets/{name}")
+    return paths
+
+
+def render_page_png(
+    doc: fitz.Document, page_index: int, assets_dir: Path, prefix: str, dpi: int
+) -> str:
+    """Render one page to ``{prefix}-pNNN-page.png`` and return its relative path."""
+    page = doc[page_index]
+    zoom = dpi / 72
+    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+    name = f"{prefix}-p{page_index + 1:03d}-page.png"
+    out = assets_dir / name
+    pix.save(str(out))
+    return f"assets/{name}"
+
+
+def find_major_section(line: str) -> tuple[str, str] | None:
+    """Return ``(number, title)`` when *line* is a top-level section heading.
+
+    TOC entries, titles ending in ":" and titles longer than 70 characters
+    are rejected.
+    """
+    m = MAJOR_SECTION.match(line.strip())
+    if not m:
+        return None
+    num, title = m.group(1), m.group(2).strip()
+    if TOC_LINE.search(title) or "." * 3 in title:
+        return None
+    if title.endswith(":"):
+        return None
+    if len(title) > 70:
+        return None
+    return num, title
+
+
+def should_start_chapter(num: str, current: dict | None) -> bool:
+    """Tell whether heading *num* opens a new chapter after *current*.
+
+    A chapter starts when there is no chapter yet, when the current one is the
+    "00" front matter, or when *num* is greater than the current number.
+    """
+    if current is None:
+        return True
+    if current.get("num") == "00":
+        return True
+    try:
+        return int(num) > int(current["num"])
+    except ValueError:
+        return False
+
+
+def _check_output_dir(pdf_path: Path, out_dir: Path) -> None:
+    """Raise ValueError if wiping *out_dir* would delete the PDF or the cwd."""
+    target = out_dir.resolve()
+    for keep in (pdf_path.resolve(), Path.cwd().resolve()):
+        if target == keep or target in keep.parents:
+            raise ValueError(f"refusing to wipe {target}: it is or contains {keep}")
+
+
+def convert(
+    pdf_path: Path,
+    out_dir: Path,
+    *,
+    extract_tables: bool = False,
+    page_dpi: int = DEFAULT_PAGE_DPI,
+    single_file: bool = False,
+) -> None:
+    """Convert *pdf_path* into markdown plus assets under *out_dir*.
+
+    *out_dir* is deleted and recreated. Pages are grouped into chapters by
+    top-level section headings; with *single_file* everything goes into one
+    ``{stem}.md``, otherwise into ``INDEX.md`` and ``chapters/``.
+
+    Raises:
+        FileNotFoundError: *pdf_path* is not an existing file.
+        ValueError: *page_dpi* is not positive, or *out_dir* is or contains
+            the PDF or the current working directory.
+    """
+    if not pdf_path.is_file():
+        raise FileNotFoundError(f"PDF not found: {pdf_path}")
+    if page_dpi <= 0:
+        raise ValueError(f"page_dpi must be positive, got {page_dpi}")
+    # Validate before rmtree: a bad -o must not destroy the input or the cwd.
+    _check_output_dir(pdf_path, out_dir)
+
+    if out_dir.exists():
+        shutil.rmtree(out_dir)
+    assets_dir = out_dir / "assets"
+    chapters_dir = out_dir / "chapters"
+    assets_dir.mkdir(parents=True)
+    if not single_file:
+        chapters_dir.mkdir(parents=True)
+
+    stem = pdf_path.stem
+    sections: list[dict] = []
+    current: dict | None = None
+    page_png_count = 0
+
+    # ExitStack closes both readers even when a page fails to convert.
+    with contextlib.ExitStack() as stack:
+        doc = stack.enter_context(fitz.open(pdf_path))
+        plumber = None
+        if extract_tables:
+            # pdfplumber is only needed (and only opened) for --tables.
+            plumber = stack.enter_context(pdfplumber.open(pdf_path))
+        for page_index in range(doc.page_count):
+            raw = doc[page_index].get_text()
+            text = clean_page_text(raw)
+            if is_toc_page(text):
+                continue
+
+            img_refs = extract_images(doc, page_index, assets_dir, stem)
+            page_png_ref: str | None = None
+            if is_broken_page(text):
+                page_png_ref = render_page_png(doc, page_index, assets_dir, stem, page_dpi)
+                page_png_count += 1
+                text = ""
+
+            tables_md: list[str] = []
+            if plumber is not None:
+                for table in plumber.pages[page_index].extract_tables() or []:
+                    md = table_to_markdown(table, text or raw)
+                    if md:
+                        tables_md.append(md)
+
+            # Page marker the README tells agents to grep for.
+            page_block = [f"<!-- page {page_index + 1} -->"]
+            if text:
+                page_block.append(text)
+            if page_png_ref:
+                page_block.append(f"![page {page_index + 1} layout]({page_png_ref})")
+            if tables_md:
+                page_block.append("\n\n".join(tables_md))
+            if img_refs:
+                page_block.append("\n".join(f"![figure]({p})" for p in img_refs))
+            page_content = "\n\n".join(page_block)
+
+            # The last heading on the page decides which chapter the page opens.
+            section_hit = None
+            for line in (text or raw).splitlines():
+                hit = find_major_section(line)
+                if hit:
+                    section_hit = hit
+
+            if section_hit and should_start_chapter(section_hit[0], current):
+                num, title = section_hit
+                if current:
+                    sections.append(current)
+                current = {
+                    "num": num,
+                    "title": title,
+                    "slug": f"{int(num):02d}-{slugify(title)}",
+                    "pages": [],
+                }
+
+            if current is None:
+                current = {
+                    "num": "00",
+                    "title": "front-matter",
+                    "slug": "00-front-matter",
+                    "pages": [],
+                }
+            current["pages"].append(page_content)
+
+    if current:
+        sections.append(current)
+
+    if single_file:
+        full_parts = []
+        for sec in sections:
+            full_parts.append(f"# {sec['num']}. {sec['title']}\n\n")
+            full_parts.append("\n\n---\n\n".join(sec["pages"]))
+            full_parts.append("\n\n")
+        (out_dir / f"{stem}.md").write_text("".join(full_parts), encoding="utf-8")
+    else:
+        index_lines = [f"# {stem}\n", f"Source: `{pdf_path.name}`\n", "## Chapters\n"]
+        for sec in sections:
+            fname = f"{sec['slug']}.md"
+            body = "\n\n---\n\n".join(sec["pages"])
+            chapter_path = chapters_dir / fname
+            header = f"# {sec['num']}. {sec['title']}\n\n"
+            chapter_path.write_text(header + body + "\n", encoding="utf-8")
+            index_lines.append(f"- [{sec['num']}. {sec['title']}](chapters/{fname})\n")
+        (out_dir / "INDEX.md").write_text("".join(index_lines), encoding="utf-8")
+
+    print(f"Wrote {out_dir}")
+    if single_file:
+        print(f"  full md:  {out_dir / f'{stem}.md'}")
+    else:
+        print(f"  chapters: {len(sections)}")
+        print(f"  index:    {out_dir / 'INDEX.md'}")
+    print(f"  images:   {sum(1 for _ in assets_dir.iterdir())}")
+    print(f"  page png: {page_png_count}")
+
+
+def main() -> None:
+    """Parse command-line arguments and run the conversion."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("pdf", type=Path)
+    parser.add_argument("-o", "--output", type=Path, default=None)
+    parser.add_argument(
+        "--tables",
+        action="store_true",
+        help="extract pdfplumber tables into markdown (off by default)",
+    )
+    parser.add_argument(
+        "--page-dpi",
+        type=int,
+        default=DEFAULT_PAGE_DPI,
+        metavar="N",
+        help=f"DPI for full-page PNG fallback on broken layout pages (default {DEFAULT_PAGE_DPI})",
+    )
+    parser.add_argument(
+        "--single-file",
+        action="store_true",
+        help="write one {stem}.md instead of INDEX.md + chapters/",
+    )
+    args = parser.parse_args()
+    # Fail as a usage error (no traceback) before anything is deleted.
+    if not args.pdf.is_file():
+        parser.error(f"PDF not found: {args.pdf}")
+    if args.page_dpi <= 0:
+        parser.error("--page-dpi must be a positive integer")
+    out = args.output or Path("out") / args.pdf.stem
+    try:
+        _check_output_dir(args.pdf, out)
+    except ValueError as exc:
+        parser.error(str(exc))
+    convert(
+        args.pdf.resolve(),
+        out.resolve(),
+        extract_tables=args.tables,
+        page_dpi=args.page_dpi,
+        single_file=args.single_file,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c22bda3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pymupdf>=1.24.0
+pdfplumber>=0.11.0
+Pillow>=10.0.0
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..6918e73
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Create/use .venv, install deps, run extract.py. Usage: ./run.sh [pdf] [-o out/dir]
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$ROOT"
+
+VENV="$ROOT/.venv"
+PY="$VENV/bin/python"
+PIP="$VENV/bin/pip"
+
+if [[ ! -d "$VENV" ]]; then
+  echo "Creating venv at $VENV ..."
+  python3 -m venv "$VENV"
+fi
+
+echo "Installing dependencies ..."
+"$PIP" install -q -r "$ROOT/requirements.txt"
+
+if [[ $# -eq 0 ]]; then
+  set -- "$ROOT/RealTek-r8169.pdf"
+fi
+
+echo "Running extract.py $*"
+exec "$PY" "$ROOT/extract.py" "$@"