# pdf_sanatizer/pdf_sanatizer.py

"""
enhanced_sanitize_pdf.py
========================

This utility removes SANS‑style watermark overlays (names, e‑mails, hashes,
dates and licence lines) from a PDF without leaving selection boxes or
visible artefacts.  It can open password‑protected files by prompting the
user for a password and will automatically write the cleaned document
alongside the original with ``_clean`` appended to its file name.

In addition to the basic cleaning implemented in ``sanitize_pdf_watermark.py``,
this version includes two important improvements:

* **Password handling:** If the input PDF is encrypted, the script will ask
  for a password using ``getpass.getpass()`` (so the input is not echoed)
  and attempt to authenticate before continuing.  Up to three attempts are
  allowed; if all of them fail, the script aborts with an error.  When
  processing unencrypted files, no prompt will be shown.

* **Robust saving:**  Certain PDFs have corrupted object streams or other
  structural issues that prevent PyMuPDF from saving them directly.  If a
  ``ValueError`` or ``RuntimeError`` is raised during ``doc.save()``, the
  script falls back to creating a repaired copy in memory using
  ``doc.tobytes(garbage=4, deflate=True)``.  It then opens this repaired
  content as a new document and saves it to disk.  This technique is
  recommended by the PyMuPDF documentation for recovering from broken
  object streams.

Usage::

    python enhanced_sanitize_pdf.py path/to/input.pdf

The cleaned PDF will be created in the same directory as ``input.pdf`` with
``_clean`` appended before the ``.pdf`` extension.  The script prints the
number of watermark segments removed.

This script requires the PyMuPDF (``fitz``) package.
"""

import argparse
import sys
import os
import re
import getpass

import fitz  # type: ignore[import]


def open_with_password(path: str, max_attempts: int = 3) -> fitz.Document:
    """Open ``path`` and prompt for a password when the PDF is encrypted.

    The file is opened with ``fitz.open``.  If the resulting document
    reports ``needs_pass``, the user is prompted (without echo, via
    ``getpass``) up to ``max_attempts`` times and each entry is passed to
    ``doc.authenticate``.

    Parameters
    ----------
    path
        Location of the PDF to open.
    max_attempts
        Maximum number of password prompts before giving up.

    Returns
    -------
    fitz.Document
        The opened (and, if necessary, authenticated) document.  The caller
        owns it and is responsible for closing it.

    Raises
    ------
    RuntimeError
        If ``fitz.open`` raises a ``RuntimeError`` for ``path``.
    ValueError
        If no entered password authenticates the document.
    """

    try:
        doc = fitz.open(path)
    except RuntimeError as exc:
        raise RuntimeError(f"Unable to open '{path}': {exc}") from exc

    if not doc.needs_pass:
        return doc

    print(f"The document '{path}' is encrypted.")
    # Ownership passes to the caller only on successful authentication.  On
    # every other exit path -- exhausted attempts, or the prompt itself
    # raising (e.g. EOFError on a closed stdin, KeyboardInterrupt) -- the
    # handle must be released here, otherwise it leaks.
    handed_over = False
    try:
        for attempt in range(1, max_attempts + 1):
            password = getpass.getpass("Please enter the password: ")
            if doc.authenticate(password):
                handed_over = True
                return doc
            print("Incorrect password. Try again." if attempt < max_attempts else "Incorrect password.")
        raise ValueError("Failed to authenticate after multiple attempts. Unable to open the PDF.")
    finally:
        if not handed_over:
            doc.close()


def remove_watermark_segments(doc: fitz.Document) -> int:
    """Strip watermark text objects from the content streams of every page.

    Each content stream of each page is decoded as Latin-1 text and run
    through four regular expressions, applied in this order.  Every match
    is deleted.  A match starts at ``BT``, runs through a ``/F<digits>
    <size> Tf`` font selection and ends at the next ``ET``:

    * size 36 followed by a ``1 Tr`` or ``3 Tr`` rendering-mode operator
      (the invisible names, e-mails, numeric codes and hashes);
    * size -25 (the rotated diagonal overlay);
    * size 10 (footer lines);
    * size 18 (footer lines).

    The expressions only look at font name and size (plus rendering mode
    for the first one); they do not inspect the text itself.  Because of
    the ``[^B]`` / ``[^E]`` character classes, a candidate is skipped when
    a literal ``B`` occurs between ``BT`` and the font name, or a literal
    ``E`` occurs between ``Tf`` and the closing ``ET``.

    Streams that end up unchanged are not rewritten.

    Returns
    -------
    int
        The total number of removed segments across all pages.
    """
    # Order matters: each expression runs on the output of the previous one.
    watermark_res = tuple(
        re.compile(expr, re.S)
        for expr in (
            # Invisible 36-point text with rendering mode 1 or 3
            r"BT[^B]*?/F\d+\s+36\s+Tf[^E]*?\s+(?:1|3)\s+Tr[^E]*?ET",
            # Diagonal overlay drawn with a -25 point font
            r"BT[^B]*?/F\d+\s+-25\s+Tf[^E]*?ET",
            # Horizontal footers drawn with 10-point fonts
            r"BT[^B]*?/F\d+\s+10\s+Tf[^E]*?ET",
            # Horizontal footers drawn with 18-point fonts
            r"BT[^B]*?/F\d+\s+18\s+Tf[^E]*?ET",
        )
    )

    total_removed = 0
    for page in doc:
        for xref in page.get_contents():
            # Latin-1 maps every byte to one code point and back, so the
            # decode/encode round trip leaves untouched bytes as they were.
            original_text = doc.xref_stream(xref).decode("latin1")
            cleaned_text = original_text
            for regex in watermark_res:
                cleaned_text, hits = regex.subn("", cleaned_text)
                total_removed += hits
            if cleaned_text == original_text:
                continue
            doc.update_stream(xref, cleaned_text.encode("latin1"))
    return total_removed


def build_output_path(input_path: str) -> str:
    """Return ``input_path`` with ``_clean`` inserted before the file extension.

    Only the final path component is altered; the directory part is kept.
    A name without an extension simply gets ``_clean`` appended.
    """
    parent = os.path.dirname(input_path)
    stem, suffix = os.path.splitext(os.path.basename(input_path))
    return os.path.join(parent, stem + "_clean" + suffix)


def save_document(doc: fitz.Document, out_path: str) -> None:
    """Save ``doc`` to ``out_path``, retrying through an in-memory copy.

    The first attempt is a direct ``doc.save()`` with ``garbage=4`` and
    ``deflate=True``.  If that raises ``ValueError`` or ``RuntimeError``,
    the document is serialised with ``doc.tobytes()`` using the same
    options, reopened from that buffer as a new document, and saved from
    there instead.

    Parameters
    ----------
    doc
        The open document to write.  It is not closed by this function.
    out_path
        Destination file path.

    Raises
    ------
    Exception
        Any error raised by the fallback path is propagated to the caller.
    """
    try:
        doc.save(out_path, garbage=4, deflate=True)
    except (ValueError, RuntimeError) as exc:
        print(f"Encountered an error while saving: {exc}. Attempting repair …")
        # Re-serialise the whole document in memory (garbage collection plus
        # stream compression) and reopen the result; saving the rebuilt copy
        # is the recovery route for files the direct save rejects.
        buffer = doc.tobytes(garbage=4, deflate=True)
        repaired = fitz.open("pdf", buffer)
        try:
            repaired.save(out_path, garbage=4, deflate=True)
        finally:
            # Release the temporary document even when the second save fails.
            repaired.close()

def sanitize_pdf(path: str) -> None:
    """Remove watermarks from the PDF at ``path`` and write a cleaned copy.

    The document is opened (prompting for a password if it is encrypted),
    its watermark segments are removed, and the result is saved next to the
    input under the name produced by ``build_output_path``.  A summary line
    with the number of removed segments is printed on success.

    Parameters
    ----------
    path
        Location of the PDF to sanitise.

    Raises
    ------
    Exception
        Errors from opening, cleaning or saving propagate to the caller; the
        document is closed before they do.
    """
    doc = open_with_password(path)
    try:
        removed = remove_watermark_segments(doc)
        out_path = build_output_path(path)
        save_document(doc, out_path)
    finally:
        # Close the handle on failure as well as on success.
        doc.close()
    print(f"Removed {removed} watermark segment(s). Cleaned PDF saved as '{out_path}'.")


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    The parser accepts a single required positional argument, stored as
    ``input_pdf``: the path of the PDF to sanitise.
    """
    summary = (
        "Remove SANS-style watermark overlays from the provided PDF. "
        "Encrypted PDFs are supported and will trigger a password prompt."
    )
    cli = argparse.ArgumentParser(description=summary)
    cli.add_argument(
        "input_pdf",
        help="Path to the PDF you want sanitized.",
        metavar="INPUT.pdf",
    )
    return cli


def main(argv: list[str]) -> None:
    """Command-line entry point.

    ``argv`` is expected in ``sys.argv`` form: the first element (the
    program name) is skipped and the remainder is parsed as arguments.
    The parsed input path is then handed to ``sanitize_pdf``.
    """
    cli_args = argv[1:]  # drop the program name
    options = build_parser().parse_args(cli_args)
    sanitize_pdf(options.input_pdf)


# Run the CLI only when executed as a script.  The full ``sys.argv`` is
# passed because ``main`` itself skips the leading program name.
if __name__ == "__main__":
    main(sys.argv)