init

2026-05-31 16:16:51 +07:00
commit 41823dcbf8
4 changed files with 468 additions and 0 deletions
@@ -0,0 +1,74 @@
 # pdf2ai
 Model-free конвертер PDF → markdown для локального агента: главы по разделам, картинки рядом, grep по `<!-- page N -->`.
 ## Зависимости
 - Python **3.10+**
 - pip-пакеты из [`requirements.txt`](requirements.txt): `pymupdf`, `pdfplumber`, `Pillow`
 Скрипт [`run.sh`](run.sh) сам создаёт `.venv` и ставит зависимости.
 ## Быстрый старт
 ```bash
 chmod +x run.sh
 ./run.sh                          # по умолчанию RealTek-r8169.pdf
 ./run.sh path/to/doc.pdf          # свой PDF
 ./run.sh doc.pdf -o out/my-doc    # свой каталог вывода
 ```
 Структура вывода `out/<stem>/` (по умолчанию):
 ```
 out/RealTek-r8169/
 ├── INDEX.md              # оглавление со ссылками на главы
 ├── chapters/             # одна глава = один файл
 │   ├── 01-features.md
 │   └── ...
 └── assets/               # embedded-изображения (в исходном формате: png/jpeg/…) + page PNG для сломанных layout-страниц
 ```
 С флагом `--single-file` вместо `INDEX.md` и `chapters/` пишется один `{stem}.md` (каталог `assets/` всё равно создаётся).
 ### Опции extract.py
 | Флаг | Описание |
 |------|----------|
 | `--single-file` | Один `{stem}.md` вместо `INDEX.md` + `chapters/` |
 | `--tables` | Включить извлечение таблиц pdfplumber (по умолчанию выключено) |
 | `--page-dpi N` | DPI для PNG-рендера страниц с битым текстом (default 150) |
 | `-o DIR` | Каталог вывода (по умолчанию `out/<stem>/`); существующий каталог удаляется и создаётся заново |
 ## Grep для агента
 Главы разбиты по крупным разделам документа. Каждая страница помечена HTML-комментарием вида `<!-- page N -->`, по которому удобно искать:
 ```bash
 # найти упоминание регистра в главе 6
 rg -n 'DTCCR' out/RealTek-r8169/chapters/06-register-descriptions.md
 # все вхождения по всему документу
 rg -n 'Interrupt Mask' out/RealTek-r8169/
 # контекст вокруг страницы 51
 rg -n -C3 'page 51' out/RealTek-r8169/chapters/09-functional-description.md
 ```
 Начните с [`INDEX.md`](out/RealTek-r8169/INDEX.md) — там ссылки на все главы.
 ## Ограничения (model-free)
 Этот инструмент **не использует LLM/VLM**. Он комбинирует:
 - PyMuPDF — текст страниц и embedded-изображения
 - pdfplumber — опциональные таблицы (`--tables`)
 - эвристику «сломанной страницы» — PNG-рендер bitfield/diagram layout
 **Что хорошо:** быстро, локально, детерминированно; grep по главам; картинки сохраняются рядом.
 **Что хуже, чем Marker/Docling:** сложная вёрстка, multi-column, сканы без OCR, семантическая структура таблиц. Для таких PDF нужны модельные конвертеры или OCR.
 ## Roadmap
 См. [`plan.md`](plan.md).
@@ -0,0 +1,366 @@
 #!/usr/bin/env python3
 """Model-free PDF → markdown chapters for agent grep (text + images + tables)."""
 from __future__ import annotations
 import argparse
 import re
 import shutil
 from pathlib import Path
 import fitz
 import pdfplumber
 # Repeated header/footer on most pages of this spec family.
 # Matches a whole line that is exactly "RTL8169", a "2002/MM/DD" date,
 # a "Rev.X.Y" tag, or a bare 1-3 digit number (presumably the page number —
 # note this also drops any body line consisting only of a short number).
 HEADER_FOOTER = re.compile(
    r"^(RTL8169|2002/\d{2}/\d{2}|Rev\.\d+\.\d+|\d{1,3})$",
    re.MULTILINE,
 )
 # "N. Title" heading: 1-2 digit number, a dot, whitespace, then a title that
 # starts with an uppercase ASCII letter and is at least 3 characters long.
 MAJOR_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][^\n]{2,})$")
 # Table-of-contents entry: a run of 4+ dots followed by a number at line end.
 TOC_LINE = re.compile(r"\.{4,}\s*\d+\s*$")
 # Minimum non-empty rows / non-empty cells for an extracted table to be kept.
 MIN_TABLE_ROWS = 2
 MIN_TABLE_CELLS = 6
 # Cells shorter than this are ignored when checking table/page-text overlap.
 MIN_CELL_LEN_FOR_DEDUP = 4
 # A cell that is a single uppercase letter or uppercase letters separated by
 # single spaces (e.g. "R S V D"); used to spot bitfield/diagram layouts.
 SPACED_LETTERS = re.compile(r"^([A-Z](?: [A-Z]){1,}|[A-Z])$")
 # Default resolution for the full-page PNG fallback (--page-dpi).
 DEFAULT_PAGE_DPI = 150
 def slugify(title: str) -> str:
    """Turn a section title into a lowercase, hyphen-separated file-name slug.

    Characters other than word characters, whitespace and hyphens are dropped,
    runs of whitespace/hyphens collapse into one hyphen, and the result is cut
    to at most 80 characters. An empty result falls back to ``"section"``.
    """
    kept = re.sub(r"[^\w\s-]", "", title, flags=re.UNICODE).strip()
    hyphenated = re.sub(r"[-\s]+", "-", kept)
    slug = hyphenated.strip("-").lower()[:80]
    return slug if slug else "section"
 def clean_page_text(raw: str) -> str:
    """Strip repeated header/footer lines and tidy whitespace of one page.

    Lines whose stripped form matches ``HEADER_FOOTER`` are removed, trailing
    whitespace is trimmed from the rest, blank lines are kept as empty lines,
    runs of three or more newlines collapse to a single blank line, and the
    final text is stripped at both ends.
    """
    kept: list[str] = []
    for source_line in raw.splitlines():
        stripped = source_line.strip()
        if stripped and HEADER_FOOTER.match(stripped):
            # Header/footer noise: drop the whole line.
            continue
        kept.append(source_line.rstrip() if stripped else "")
    joined = "\n".join(kept)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()
 def is_toc_page(text: str) -> bool:
    """Return True when the page looks like a table of contents.

    A page qualifies when the number of non-blank lines matching ``TOC_LINE``
    (dot leaders followed by a number) is at least 3 and at least a quarter of
    all non-blank lines (integer division). An empty page is never a TOC page.
    """
    non_blank = [s for s in (ln.strip() for ln in text.splitlines()) if s]
    if not non_blank:
        return False
    leader_count = len([s for s in non_blank if TOC_LINE.search(s)])
    threshold = max(3, len(non_blank) // 4)
    return leader_count >= threshold
 def _max_vertical_letter_run(text: str) -> int:
    run = best = 0
    for line in text.splitlines():
        token = line.strip()
        if len(token) == 1 and token.isalpha():
            run += 1
            best = max(best, run)
        else:
            run = 0
    return best
 def is_broken_page(text: str) -> bool:
    """Detect layout-heavy pages whose extracted text is mostly fragments.

    Pages with fewer than 12 non-blank lines are never flagged. Otherwise the
    page is flagged when any of these holds:

    * four or more consecutive single-letter lines;
    * at least 20 one-character lines;
    * at least 12 one-character lines, outnumbering 45% of the "readable"
      lines (10+ characters, or 5+ characters containing a space);
    * at least 25 lines of two characters or fewer, outnumbering the readable
      lines and making up at least 35% of all non-blank lines.
    """
    tokens = [s for s in (ln.strip() for ln in text.splitlines()) if s]
    total = len(tokens)
    if total < 12:
        return False
    if _max_vertical_letter_run(text) >= 4:
        return True

    one_char = 0
    tiny = 0
    readable = 0
    for tok in tokens:
        size = len(tok)
        if size == 1:
            one_char += 1
        if size <= 2:
            tiny += 1
        if size >= 10 or (" " in tok and size >= 5):
            readable += 1

    # Bitfield/diagram pages: many one-letter lines stacked vertically.
    many_single = one_char >= 20
    single_dominant = one_char >= 12 and one_char > readable * 0.45
    # Mostly fragmented tokens, little readable prose.
    fragmented = tiny >= 25 and tiny > readable and tiny / total >= 0.35
    return many_single or single_dominant or fragmented
 def is_layout_table(rows: list[list]) -> bool:
    """Return True for tables that should be rejected as layout artefacts.

    A table is rejected when it has no non-empty cells, fewer than
    ``MIN_TABLE_ROWS`` rows, fewer than ``MIN_TABLE_CELLS`` non-empty cells,
    when more than 55% of its non-empty cells are 3 characters or shorter, or
    when more than 25% of them match ``SPACED_LETTERS``.
    """
    flat = [(cell or "").strip() for row in rows for cell in row]
    texts = [c for c in flat if c]
    count = len(texts)
    if count == 0 or len(rows) < MIN_TABLE_ROWS or count < MIN_TABLE_CELLS:
        return True

    short_ratio = len([c for c in texts if len(c) <= 3]) / count
    spaced_ratio = len([c for c in texts if SPACED_LETTERS.match(c)]) / count
    if short_ratio > 0.55 or spaced_ratio > 0.25:
        return True

    # Single-row "tables" from layout heuristics (one data row + header).
    # With MIN_TABLE_CELLS = 6 the cell-count floor above already rejects
    # these, so this only matters if that constant is lowered to 4 or less.
    return len(rows) == 2 and count <= 4
 def table_overlaps_page_text(rows: list[list], page_text: str) -> bool:
    """Tell whether a table merely repeats text that is already on the page.

    Both the page text and every cell are compared case-insensitively with all
    whitespace runs collapsed to a single space. Cells shorter than
    ``MIN_CELL_LEN_FOR_DEDUP`` are ignored. A cell counts as a hit when it
    occurs in the page text, or — for cells of 40+ characters — when its first
    40 characters do.

    Args:
        rows: Table rows; cells may be ``None`` or strings.
        page_text: Text of the page the table was extracted from.

    Returns:
        True when at least 3 cells were checked and at least half of them
        were found in the page text.
    """
    page_norm = re.sub(r"\s+", " ", page_text.lower())
    hits = 0
    checked = 0
    for row in rows:
        for cell in row:
            # Normalise the cell exactly like page_norm. Replacing only "\n"
            # left double spaces behind for cells wrapped as "foo \nbar", so
            # such cells could never match their own page text.
            c = re.sub(r"\s+", " ", (cell or "").strip())
            if len(c) < MIN_CELL_LEN_FOR_DEDUP:
                continue
            checked += 1
            if c.lower() in page_norm:
                hits += 1
                continue
            # Long descriptions: match if a substantial prefix is present.
            if len(c) >= 40 and c[:40].lower() in page_norm:
                hits += 1
    return checked >= 3 and hits / checked >= 0.5
 def table_to_markdown(table: list[list], page_text: str) -> str | None:
    """Render an extracted table as a markdown pipe table, or reject it.

    Cells are stripped and embedded newlines become spaces; rows without any
    content are dropped. The first remaining row becomes the header and short
    rows are padded with empty cells to the widest row.

    Args:
        table: Rows of cells (``None`` or strings).
        page_text: Text of the page, used to skip tables that duplicate it.

    Returns:
        The markdown table, or ``None`` when the table has fewer than
        ``MIN_TABLE_ROWS`` non-empty rows, fewer than 2 columns, fewer than
        ``MIN_TABLE_CELLS`` non-empty cells, is judged a layout artefact by
        ``is_layout_table``, or overlaps the page text.
    """
    rows = []
    for row in table:
        cells = [(c or "").strip().replace("\n", " ") for c in row]
        if any(cells):
            rows.append(cells)
    if len(rows) < MIN_TABLE_ROWS:
        return None
    width = max(len(r) for r in rows)
    if width < 2:
        return None
    if is_layout_table(rows) or table_overlaps_page_text(rows, page_text):
        return None
    # Same floor as is_layout_table; kept so the minimum does not depend on
    # that helper's internals.
    if sum(1 for r in rows for c in r if c) < MIN_TABLE_CELLS:
        return None

    def render(cells: list[str]) -> str:
        # A literal "|" inside a cell would be read as a column separator and
        # corrupt the row, so escape it. This is done only at render time so
        # the overlap check above still sees the unmodified cell text.
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(c.replace("|", "\\|") for c in padded) + " |"

    # Every kept row has at least one non-empty cell, so the first row can
    # always serve as the header; no synthetic "colN" header is needed.
    header, *body = rows
    lines = [render(header), "| " + " | ".join(["---"] * width) + " |"]
    lines.extend(render(r) for r in body)
    return "\n".join(lines)
 def extract_images(doc: fitz.Document, page_index: int, assets_dir: Path, prefix: str) -> list[str]:
    """Save the embedded images of one page and return their relative paths.

    Files are named ``{prefix}-pNNN-imgK.{ext}``, where ``NNN`` is the 1-based
    page number and ``K`` the 1-based position of the image in the page's
    image list. The extension comes from the extracted image info and
    defaults to ``png``. Images that fail to extract are skipped, so ``K`` may
    have gaps.

    Returns:
        ``assets/<name>`` references, in page order, for the images written.
    """
    page_no = page_index + 1
    refs: list[str] = []
    for position, entry in enumerate(doc[page_index].get_images(full=True), start=1):
        xref = entry[0]
        try:
            payload = doc.extract_image(xref)
        except Exception:
            # Best effort: an image that cannot be extracted is left out.
            continue
        suffix = payload.get("ext", "png")
        file_name = f"{prefix}-p{page_no:03d}-img{position}.{suffix}"
        (assets_dir / file_name).write_bytes(payload["image"])
        refs.append(f"assets/{file_name}")
    return refs
 def render_page_png(
    doc: fitz.Document, page_index: int, assets_dir: Path, prefix: str, dpi: int
 ) -> str:
    """Render a whole page to ``{prefix}-pNNN-page.png`` and return its path.

    The page is rasterised with a uniform zoom of ``dpi / 72`` on both axes
    (72 presumably being the PDF base resolution in points per inch).
    ``NNN`` is the 1-based page number.

    Returns:
        The ``assets/<name>`` reference of the written PNG.
    """
    scale = dpi / 72
    file_name = f"{prefix}-p{page_index + 1:03d}-page.png"
    pixmap = doc[page_index].get_pixmap(matrix=fitz.Matrix(scale, scale))
    pixmap.save(str(assets_dir / file_name))
    return f"assets/{file_name}"
 def find_major_section(line: str) -> tuple[str, str] | None:
    """Parse a "N. Title" heading line into ``(number, title)``.

    Returns ``None`` when the stripped line does not match ``MAJOR_SECTION``,
    or when the title looks like a table-of-contents entry (matches
    ``TOC_LINE`` or contains three consecutive dots), ends with a colon, or is
    longer than 70 characters.
    """
    matched = MAJOR_SECTION.match(line.strip())
    if matched is None:
        return None
    number = matched.group(1)
    heading = matched.group(2).strip()
    looks_like_toc = bool(TOC_LINE.search(heading)) or "..." in heading
    if looks_like_toc or heading.endswith(":") or len(heading) > 70:
        return None
    return number, heading
 def should_start_chapter(num: str, current: dict | None) -> bool:
    if current is None:
        return True
    if current.get("num") == "00":
        return True
    try:
        return int(num) > int(current["num"])
    except ValueError:
        return False
 def convert(
    pdf_path: Path,
    out_dir: Path,
    *,
    extract_tables: bool = False,
    page_dpi: int = DEFAULT_PAGE_DPI,
    single_file: bool = False,
 ) -> None:
    """Convert a PDF into markdown chapters (or one file) plus an assets folder.

    For every page that is not a table-of-contents page the function cleans
    the text, saves embedded images, renders a full-page PNG when the text
    looks broken (the text is then dropped), optionally adds pdfplumber
    tables, and prefixes the result with a ``<!-- page N -->`` marker. Pages
    are grouped into chapters by "N. Title" headings; pages before the first
    heading go into a ``00`` "front-matter" chapter.

    Args:
        pdf_path: PDF to read.
        out_dir: Output directory. If it exists it is deleted and recreated.
        extract_tables: Also extract tables with pdfplumber.
        page_dpi: Resolution of the full-page PNG fallback.
        single_file: Write one ``{stem}.md`` instead of ``INDEX.md`` plus
            ``chapters/``.

    Raises:
        ValueError: If ``out_dir`` is a directory that contains ``pdf_path``,
            because wiping it would delete the input.
    """
    # out_dir is wiped below; refuse when that would delete the input PDF
    # itself (e.g. "-o ." while the PDF sits in the current directory).
    if out_dir.resolve() in pdf_path.resolve().parents:
        raise ValueError(
            f"output directory {out_dir} contains the input PDF {pdf_path}; "
            "refusing to delete it"
        )
    assets_dir = out_dir / "assets"
    chapters_dir = out_dir / "chapters"
    stem = pdf_path.stem
    sections: list[dict] = []
    current: dict | None = None
    page_png_count = 0
    # Open both readers before touching out_dir so that a missing or
    # unreadable PDF does not destroy the output of a previous run. The
    # with-block also closes both handles if a page fails part-way.
    with fitz.open(pdf_path) as doc, pdfplumber.open(pdf_path) as plumber:
        if out_dir.exists():
            shutil.rmtree(out_dir)
        assets_dir.mkdir(parents=True)
        if not single_file:
            chapters_dir.mkdir(parents=True)
        for page_index in range(doc.page_count):
            raw = doc[page_index].get_text()
            text = clean_page_text(raw)
            if is_toc_page(text):
                continue
            img_refs = extract_images(doc, page_index, assets_dir, stem)
            page_png_ref: str | None = None
            if is_broken_page(text):
                page_png_ref = render_page_png(doc, page_index, assets_dir, stem, page_dpi)
                page_png_count += 1
                # The PNG replaces the unreadable text in the output.
                text = ""
            # Broken pages have no cleaned text left; table dedup and heading
            # detection then fall back to the raw page text.
            source = text or raw
            tables_md: list[str] = []
            if extract_tables:
                for table in plumber.pages[page_index].extract_tables() or []:
                    md = table_to_markdown(table, source)
                    if md:
                        tables_md.append(md)
            page_block = [f"<!-- page {page_index + 1} -->"]
            if text:
                page_block.append(text)
            if page_png_ref:
                page_block.append(f"![page {page_index + 1} layout]({page_png_ref})")
            if tables_md:
                page_block.append("\n\n".join(tables_md))
            if img_refs:
                page_block.append("\n".join(f"![figure]({p})" for p in img_refs))
            page_content = "\n\n".join(x for x in page_block if x)
            # Chapters are assigned per page: when several headings appear on
            # one page, the last one wins and the whole page goes to it.
            section_hit = None
            for line in source.splitlines():
                hit = find_major_section(line)
                if hit:
                    section_hit = hit
            if section_hit and should_start_chapter(section_hit[0], current):
                num, title = section_hit
                if current:
                    sections.append(current)
                current = {
                    "num": num,
                    "title": title,
                    "slug": f"{int(num):02d}-{slugify(title)}",
                    "pages": [],
                }
            if current is None:
                current = {
                    "num": "00",
                    "title": "front-matter",
                    "slug": "00-front-matter",
                    "pages": [],
                }
            current["pages"].append(page_content)
    if current:
        sections.append(current)
    if single_file:
        full_parts = []
        for sec in sections:
            full_parts.append(f"# {sec['num']}. {sec['title']}\n\n")
            full_parts.append("\n\n---\n\n".join(sec["pages"]))
            full_parts.append("\n\n")
        (out_dir / f"{stem}.md").write_text("".join(full_parts), encoding="utf-8")
    else:
        index_lines = [f"# {stem}\n", f"Source: `{pdf_path.name}`\n", "## Chapters\n"]
        for sec in sections:
            fname = f"{sec['slug']}.md"
            body = "\n\n---\n\n".join(sec["pages"])
            header = f"# {sec['num']}. {sec['title']}\n\n"
            (chapters_dir / fname).write_text(header + body + "\n", encoding="utf-8")
            index_lines.append(f"- [{sec['num']}. {sec['title']}](chapters/{fname})\n")
        (out_dir / "INDEX.md").write_text("".join(index_lines), encoding="utf-8")
    print(f"Wrote {out_dir}")
    if single_file:
        print(f"  full md:  {out_dir / f'{stem}.md'}")
    else:
        print(f"  chapters: {len(sections)}")
        print(f"  index:    {out_dir / 'INDEX.md'}")
    # Counts every file in assets/, i.e. embedded images and page PNGs alike.
    print(f"  images:   {sum(1 for _ in assets_dir.glob('*'))}")
    print(f"  page png: {page_png_count}")
 def main() -> None:
    """Command-line entry point: parse arguments, validate them, run convert().

    The output directory defaults to ``out/<pdf stem>``. Both paths are
    resolved to absolute paths before being handed to ``convert``. A missing
    PDF or a non-positive ``--page-dpi`` ends the program through
    ``parser.error`` (usage message, exit status 2).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("pdf", type=Path)
    parser.add_argument("-o", "--output", type=Path, default=None)
    parser.add_argument(
        "--tables",
        action="store_true",
        help="extract pdfplumber tables into markdown (off by default)",
    )
    parser.add_argument(
        "--page-dpi",
        type=int,
        default=DEFAULT_PAGE_DPI,
        metavar="N",
        help=f"DPI for full-page PNG fallback on broken layout pages (default {DEFAULT_PAGE_DPI})",
    )
    parser.add_argument(
        "--single-file",
        action="store_true",
        help="write one {stem}.md instead of INDEX.md + chapters/",
    )
    args = parser.parse_args()
    # Fail with a usage error up front instead of a library traceback later.
    if not args.pdf.is_file():
        parser.error(f"PDF not found: {args.pdf}")
    if args.page_dpi <= 0:
        parser.error("--page-dpi must be a positive integer")
    out = args.output or Path("out") / args.pdf.stem
    convert(
        args.pdf.resolve(),
        out.resolve(),
        extract_tables=args.tables,
        page_dpi=args.page_dpi,
        single_file=args.single_file,
    )
 if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
 pymupdf>=1.24.0
 pdfplumber>=0.11.0
 Pillow>=10.0.0
@@ -0,0 +1,25 @@
 #!/usr/bin/env bash
 # Create/use .venv, install deps, run extract.py. Usage: ./run.sh [pdf] [-o out/dir]
 set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # NOTE: after this cd, relative PDF/output paths given on the command line
 # are resolved against the script directory, not the caller's directory.
 cd "$ROOT"
 VENV="$ROOT/.venv"
 PY="$VENV/bin/python"
 # Test for the interpreter rather than the directory: an interrupted
 # `python3 -m venv` leaves .venv behind without a usable bin/python.
 if [[ ! -x "$PY" ]]; then
  echo "Creating venv at $VENV ..."
  python3 -m venv "$VENV"
 fi
 echo "Installing dependencies ..."
 # `python -m pip` always installs into this venv's interpreter, even when
 # the bin/pip wrapper script is missing or stale.
 "$PY" -m pip install -q -r "$ROOT/requirements.txt"
 # No arguments: fall back to the sample PDF next to this script.
 if [[ $# -eq 0 ]]; then
  set -- "$ROOT/RealTek-r8169.pdf"
 fi
 echo "Running extract.py $*"
 exec "$PY" "$ROOT/extract.py" "$@"