From bd42e2f7e3e926784632dc7117cb0dee54530dbb Mon Sep 17 00:00:00 2001
From: tobias
Date: Sun, 16 Nov 2025 22:30:36 +0100
Subject: [PATCH] first version

---
 README.md        |  35 ++++++++
 pdf_sanatizer.py | 194 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   8 ++
 3 files changed, 237 insertions(+)
 create mode 100644 pdf_sanatizer.py
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index e69de29..15fc11a 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,35 @@
+# SANS Courseware Watermark Remover
+
+Removes personalized SANS “Licensed To …” watermarks (names, emails, hashes, dates) by deleting the actual `BT…ET` text objects in the PDF—no white boxes, no selection artefacts.
+
+## Quickstart
+
+1. **(If encrypted) Decrypt first with `qpdf`:**
+
+   ```bash
+   qpdf --password='' --decrypt INPUT.pdf INPUT_unlocked.pdf
+   ```
+
+2. **Install dependencies (Python 3.9+):**
+
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate  # Windows: .venv\Scripts\activate
+   pip install -r requirements.txt
+   ```
+
+3. **Run the sanitizer (auto-creates `_clean.pdf`):**
+
+   ```bash
+   # Recommended latest script:
+   python pdf_sanatizer.py INPUT_unlocked.pdf
+   ```
+
+## Notes
+
+* The tool targets common SANS watermark patterns:
+  * Invisible 36-pt text with rendering modes 1/3,
+  * Rotated diagonal overlay using −25-pt fonts,
+  * Footer lines at 10/18-pt.
+* If your course PDFs use different fonts/sizes, adjust the regex patterns inside the script.
+
diff --git a/pdf_sanatizer.py b/pdf_sanatizer.py
new file mode 100644
index 0000000..6f0b298
--- /dev/null
+++ b/pdf_sanatizer.py
@@ -0,0 +1,194 @@
+"""
+pdf_sanatizer.py
+================
+
+This utility removes SANS‑style watermark overlays (names, e‑mails, hashes,
+dates and licence lines) from a PDF without leaving selection boxes or
+visible artefacts.  It can open password‑protected files by prompting the
+user for a password and will automatically write the cleaned document
+alongside the original with ``_clean`` appended to its file name.
+
+In addition to the basic cleaning implemented in ``sanitize_pdf_watermark.py``,
+this version includes two important improvements:
+
+* **Password handling:** If the input PDF requires a password, the script
+  asks for it using ``input()`` and attempts to authenticate before
+  continuing.  Should the password prove wrong, the script aborts with a
+  ``ValueError`` describing the problem.  When processing files that open
+  without a password, no prompt is shown.
+
+* **Robust saving:** Certain PDFs have corrupted object streams or other
+  structural issues that prevent PyMuPDF from saving them directly.  If a
+  ``ValueError`` or ``RuntimeError`` is raised during ``doc.save()``, the
+  script falls back to creating a repaired copy in memory using
+  ``doc.tobytes(garbage=4, deflate=True)``.  It then opens this repaired
+  content as a new document and saves it to disk.  This technique is
+  recommended by the PyMuPDF documentation for recovering from broken
+  object streams.
+
+Usage::
+
+    python pdf_sanatizer.py path/to/input.pdf
+
+The cleaned PDF will be created in the same directory as ``input.pdf`` with
+``_clean`` appended before the ``.pdf`` extension.  The script prints the
+number of watermark segments removed.
+
+This script requires the PyMuPDF (``fitz``) package.
+"""
+
+import sys
+import os
+import re
+from typing import Tuple, Optional
+
+import fitz  # type: ignore[import]
+
+
+# Regular expressions matching complete ``BT … ET`` text objects used by the
+# SANS licence overlays; compiled once at import time.
+# NOTE(review): because of the ``[^B]`` / ``[^E]`` classes, a text object with
+# a literal "B" before its font operator or an "E" after it is not matched,
+# and the 10/18-point patterns match any text object set at those sizes.
+# Adjust the expressions if a document is cleaned too little or too much.
+_WATERMARK_PATTERNS = (
+    # Invisible 36‑point text with rendering mode 1 or 3
+    re.compile(r"BT[^B]*?/F\d+\s+36\s+Tf[^E]*?\s+(?:1|3)\s+Tr[^E]*?ET", re.S),
+    # Diagonal overlay drawn with a –25 point font
+    re.compile(r"BT[^B]*?/F\d+\s+-25\s+Tf[^E]*?ET", re.S),
+    # Horizontal footers drawn with 10‑point fonts
+    re.compile(r"BT[^B]*?/F\d+\s+10\s+Tf[^E]*?ET", re.S),
+    # Horizontal footers drawn with 18‑point fonts
+    re.compile(r"BT[^B]*?/F\d+\s+18\s+Tf[^E]*?ET", re.S),
+)
+
+
+def open_with_password(path: str) -> fitz.Document:
+    """Open a PDF file, prompting the user for a password if necessary.
+
+    PyMuPDF opens encrypted files without raising; ``Document.needs_pass``
+    reports whether a password is still required.  Only in that case is the
+    user prompted to enter one.
+
+    Parameters
+    ----------
+    path : str
+        Location of the PDF to open.
+
+    Returns
+    -------
+    fitz.Document
+        The opened and, where required, authenticated document.
+
+    Raises
+    ------
+    ValueError
+        If the entered password is rejected.
+    """
+    doc = fitz.open(path)
+    if not doc.needs_pass:
+        return doc
+    print(f"The document '{path}' is encrypted.")
+    password = input("Please enter the password: ")
+    # ``authenticate`` returns 0 (falsy) when the password is rejected.
+    if not doc.authenticate(password):
+        doc.close()
+        raise ValueError("Incorrect password provided. Unable to open the PDF.")
+    return doc
+
+
+def remove_watermark_segments(doc: fitz.Document) -> int:
+    """Remove known watermark text blocks from every page of the document.
+
+    Watermark segments are identified by unusual font sizes and text
+    rendering modes used exclusively by the SANS licence overlays:
+
+    * Invisible 36‑point text with rendering mode 1 or 3 (hidden names,
+      e‑mails, numeric codes and hashes).
+    * Rotated overlays drawn with a –25 point font.
+    * Footer lines drawn with 10‑ or 18‑point fonts containing the
+      licence owner’s information.
+
+    Every content stream of every page is searched with
+    ``_WATERMARK_PATTERNS`` and each matching ``BT … ET`` text object is
+    deleted.  Streams are only rewritten when something was removed.
+
+    Returns
+    -------
+    int
+        The total number of removed segments across all pages.
+    """
+    removed_count = 0
+    for page in doc:
+        # Iterate over all content streams referenced by this page
+        for xref in page.get_contents():
+            # latin1 maps bytes 0-255 one-to-one, so the round trip is lossless
+            stream = doc.xref_stream(xref).decode("latin1")
+            modified = stream
+            for pat in _WATERMARK_PATTERNS:
+                modified, count = pat.subn("", modified)
+                removed_count += count
+            if modified != stream:
+                # Update the stream in place
+                doc.update_stream(xref, modified.encode("latin1"))
+    return removed_count
+
+
+def build_output_path(input_path: str) -> str:
+    """Derive the output filename by inserting ``_clean`` before the extension."""
+    directory, filename = os.path.split(input_path)
+    base, ext = os.path.splitext(filename)
+    return os.path.join(directory, f"{base}_clean{ext}")
+
+
+def save_document(doc: fitz.Document, out_path: str) -> None:
+    """Save the document to ``out_path`` with a robust fallback.
+
+    PyMuPDF may raise errors when saving PDFs with damaged object streams or
+    encryption.  In such cases, attempt to repair the document by
+    exporting its bytes and reopening it as a new document, then save
+    again.  If the fallback also fails, the exception is propagated.
+    """
+    try:
+        doc.save(out_path, garbage=4, deflate=True)
+    except (ValueError, RuntimeError) as exc:
+        print(f"Encountered an error while saving: {exc}. Attempting repair …")
+        # Export the document to bytes, performing garbage collection and
+        # stream compression, then reopen that buffer as a fresh document.
+        buffer = doc.tobytes(garbage=4, deflate=True)
+        repaired = fitz.open("pdf", buffer)
+        try:
+            repaired.save(out_path, garbage=4, deflate=True)
+        finally:
+            # Release the in-memory copy even if the second save fails
+            repaired.close()
+
+
+def sanitize_pdf(path: str) -> None:
+    """Perform watermark removal on ``path`` and write a cleaned copy."""
+    doc = open_with_password(path)
+    try:
+        removed = remove_watermark_segments(doc)
+        out_path = build_output_path(path)
+        save_document(doc, out_path)
+    finally:
+        # Always release the document, including when cleaning or saving fails
+        doc.close()
+    print(f"Removed {removed} watermark segment(s). Cleaned PDF saved as '{out_path}'.")
+
+
+def main(argv: list[str]) -> int:
+    """Run the sanitizer for the single PDF named on the command line.
+
+    Returns the process exit status: ``0`` on success, ``2`` when the
+    argument count is wrong.
+    """
+    if len(argv) != 2:
+        print("Usage: python pdf_sanatizer.py input.pdf")
+        return 2
+    sanitize_pdf(argv[1])
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4687504
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+
+# Requirements for the SANS watermark removal tool
+#
+# This tool requires the PyMuPDF package to manipulate PDF files. Install
+# it using pip before running the script:
+#   pip install -r requirements.txt
+
+PyMuPDF>=1.23.0