first version

2025-11-16 22:30:36 +01:00
parent 3d3625c93a
commit bd42e2f7e3
3 changed files with 203 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,35 @@
+# SANS Courseware Watermark Remover
+
+Removes personalized SANS “Licensed To …” watermarks (names, emails, hashes, dates) by deleting the actual `BT…ET` text objects in the PDF—no white boxes, no selection artefacts.
+
+## Quickstart
+
+1. **(If encrypted) Decrypt first with `qpdf`:**
+   
+   ```bash
+   qpdf --password='<YOUR_PASSWORD>' --decrypt INPUT.pdf INPUT_unlocked.pdf
+   ```
+
+2. **Install dependencies (Python 3.9+):**
+
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate            # Windows: .venv\Scripts\activate
+   pip install -r requirements.txt
+   ```
+
+3. **Run the sanitizer (auto-creates `<input>_clean.pdf`):**
+
+   ```bash
+   # Recommended latest script:
+   python enhanced_sanitize_pdf.py INPUT_unlocked.pdf
+   ```
+
+## Notes
+
+* The tool targets common SANS watermark patterns:
+  * Invisible 36-pt text with rendering modes 1/3,
+  * Rotated diagonal overlay using −25-pt fonts,
+  * Footer lines at 10/18-pt.
+* If your course PDFs use different fonts/sizes, adjust the regex patterns inside the script.
+
--- a/pdf_sanatizer.py
+++ b/pdf_sanatizer.py
@@ -0,0 +1,160 @@
+"""
+enhanced_sanitize_pdf.py
+========================
+
+This utility removes SANS‑style watermark overlays (names, e‑mails, hashes,
+dates and licence lines) from a PDF without leaving selection boxes or
+visible artefacts.  It can open password‑protected files by prompting the
+user for a password and will automatically write the cleaned document
+alongside the original with ``_clean`` appended to its file name.
+
+In addition to the basic cleaning implemented in ``sanitize_pdf_watermark.py``,
+this version includes two important improvements:
+
+* **Password handling:** If the input PDF is encrypted, the script will ask
+  for a password using ``input()`` and attempt to authenticate before
+  continuing.  Should the password prove wrong, the script aborts with a
+  helpful message.  When processing unencrypted files, no prompt will be
+  shown.
+
+* **Robust saving:**  Certain PDFs have corrupted object streams or other
+  structural issues that prevent PyMuPDF from saving them directly.  If a
+  ``ValueError`` or ``RuntimeError`` is raised during ``doc.save()``, the
+  script falls back to creating a repaired copy in memory using
+  ``doc.tobytes(garbage=4, deflate=True)``.  It then opens this repaired
+  content as a new document and saves it to disk.  This technique is
+  recommended by the PyMuPDF documentation for recovering from broken
+  object streams.
+
+Usage::
+
+    python enhanced_sanitize_pdf.py path/to/input.pdf
+
+The cleaned PDF will be created in the same directory as ``input.pdf`` with
+``_clean`` appended before the ``.pdf`` extension.  The script prints the
+number of watermark segments removed.
+
+This script requires the PyMuPDF (``fitz``) package.
+"""
+
+import sys
+import os
+import re
+from typing import Tuple, Optional
+
+import fitz  # type: ignore[import]
+
+
+def open_with_password(path: str) -> fitz.Document:
+    """Open a PDF file, prompting the user for a password if necessary.
+
+    If the file is encrypted, PyMuPDF raises a ``RuntimeError``.  In that
+    case the user is prompted to enter the password.  If authentication
+    fails, a ``ValueError`` is raised and the program aborts.
+    """
+    try:
+        return fitz.open(path)
+    except RuntimeError:
+        # Document appears to be encrypted
+        print(f"The document '{path}' is encrypted.")
+        password = input("Please enter the password: ")
+        doc = fitz.open(path)
+        if not doc.authenticate(password):
+            raise ValueError("Incorrect password provided. Unable to open the PDF.")
+        return doc
+
+
+def remove_watermark_segments(doc: fitz.Document) -> int:
+    """Remove known watermark text blocks from every page of the document.
+
+    Watermark segments are identified by unusual font sizes and text
+    rendering modes used exclusively by the SANS licence overlays:
+
+    * Invisible 36‑point text with rendering mode 1 or 3 (hidden names,
+      e‑mails, numeric codes and hashes).
+    * Rotated overlays drawn with a –25 point font.
+    * Footer lines drawn with 10‑ or 18‑point fonts containing the
+      licence owner’s information.
+
+    The patterns below use regular expressions to locate complete
+    ``BT … ET`` text objects matching these criteria and delete them.
+
+    Returns
+    -------
+    int
+        The total number of removed segments across all pages.
+    """
+    # Compile regular expressions once for efficiency
+    patterns = [
+        # Invisible 36‑point text with rendering mode 1 or 3
+        re.compile(r"BT[^B]*?/F\d+\s+36\s+Tf[^E]*?\s+(?:1|3)\s+Tr[^E]*?ET", re.S),
+        # Diagonal overlay drawn with a –25 point font
+        re.compile(r"BT[^B]*?/F\d+\s+-25\s+Tf[^E]*?ET", re.S),
+        # Horizontal footers drawn with 10‑point fonts
+        re.compile(r"BT[^B]*?/F\d+\s+10\s+Tf[^E]*?ET", re.S),
+        # Horizontal footers drawn with 18‑point fonts
+        re.compile(r"BT[^B]*?/F\d+\s+18\s+Tf[^E]*?ET", re.S),
+    ]
+    removed_count = 0
+    for page in doc:
+        # Iterate over all content streams referenced by this page
+        for xref in page.get_contents():
+            stream = doc.xref_stream(xref).decode("latin1")
+            modified = stream
+            for pat in patterns:
+                modified, count = pat.subn("", modified)
+                removed_count += count
+            if modified != stream:
+                # Update the stream in place
+                doc.update_stream(xref, modified.encode("latin1"))
+    return removed_count
+
+
+def build_output_path(input_path: str) -> str:
+    """Derive the output filename by inserting ``_clean`` before the extension."""
+    directory, filename = os.path.split(input_path)
+    base, ext = os.path.splitext(filename)
+    return os.path.join(directory, f"{base}_clean{ext}")
+
+
+def save_document(doc: fitz.Document, out_path: str) -> None:
+    """Save the document to ``out_path`` with a robust fallback.
+
+    PyMuPDF may raise errors when saving PDFs with damaged object streams or
+    encryption.  In such cases, attempt to repair the document by
+    exporting its bytes and reopening it as a new document, then save
+    again.  If the fallback also fails, the exception is propagated.
+    """
+    try:
+        doc.save(out_path, garbage=4, deflate=True)
+    except (ValueError, RuntimeError) as exc:
+        # Attempt to repair the document by exporting and reopening
+        print(f"Encountered an error while saving: {exc}. Attempting repair …")
+        # Export the document to bytes, performing garbage collection and
+        # decompression.  This will also decrypt the file if a password
+        # was used【206307526196463†L135-L154】.
+        buffer = doc.tobytes(garbage=4, deflate=True)
+        repaired = fitz.open("pdf", buffer)
+        repaired.save(out_path, garbage=4, deflate=True)
+        repaired.close()
+
+
+def sanitize_pdf(path: str) -> None:
+    """Perform watermark removal on ``path`` and write a cleaned copy."""
+    doc = open_with_password(path)
+    removed = remove_watermark_segments(doc)
+    out_path = build_output_path(path)
+    save_document(doc, out_path)
+    doc.close()
+    print(f"Removed {removed} watermark segment(s). Cleaned PDF saved as '{out_path}'.")
+
+
+def main(argv: list[str]) -> None:
+    if len(argv) != 2:
+        print("Usage: python enhanced_sanitize_pdf.py input.pdf")
+        return
+    sanitize_pdf(argv[1])
+
+
+# Script entry point: forward the raw command-line arguments to ``main``.
+if __name__ == "__main__":
+    main(sys.argv)
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+
+# Requirements for the SANS watermark removal tool
+#
+# This tool requires the PyMuPDF package to manipulate PDF files.  Install
+# it using pip before running the script:
+#   pip install -r requirements.txt
+
+PyMuPDF>=1.23.0