first version

This commit is contained in:
tobias
2025-11-16 22:30:36 +01:00
parent 3d3625c93a
commit bd42e2f7e3
3 changed files with 203 additions and 0 deletions

View File

@@ -0,0 +1,35 @@
# SANS Courseware Watermark Remover
Removes personalized SANS “Licensed To …” watermarks (names, emails, hashes, dates) by deleting the actual `BT…ET` text objects in the PDF—no white boxes, no selection artefacts.
## Quickstart
1. **(If encrypted) Decrypt first with `qpdf`:**
```bash
qpdf --password='<YOUR_PASSWORD>' --decrypt INPUT.pdf INPUT_unlocked.pdf
```
2. **Install dependencies (Python 3.9+):**
```bash
python -m venv .venv
source .venv/bin/activate # Windows: .venv\Scripts\activate
pip install -r requirements.txt
```
3. **Run the sanitizer (auto-creates `<input>_clean.pdf`):**
```bash
# Recommended latest script:
python enhanced_sanitize_pdf.py INPUT_unlocked.pdf
```
## Notes
* The tool targets common SANS watermark patterns:
  * Invisible 36-pt text with rendering modes 1/3,
  * Rotated diagonal overlay using 25-pt fonts,
  * Footer lines at 10/18-pt.
* If your course PDFs use different fonts/sizes, adjust the regex patterns inside the script.

160
pdf_sanatizer.py Normal file
View File

@@ -0,0 +1,160 @@
"""
enhanced_sanitize_pdf.py
========================
This utility removes SANS-style watermark overlays (names, emails, hashes,
dates and licence lines) from a PDF without leaving selection boxes or
visible artefacts. It can open password-protected files by prompting the
user for a password and will automatically write the cleaned document
alongside the original with ``_clean`` appended to its file name.
In addition to the basic cleaning implemented in ``sanitize_pdf_watermark.py``,
this version includes two important improvements:
* **Password handling:** If the input PDF is encrypted, the script will ask
for a password using ``input()`` and attempt to authenticate before
continuing. Should the password prove wrong, the script aborts with a
helpful message. When processing unencrypted files, no prompt will be
shown.
* **Robust saving:** Certain PDFs have corrupted object streams or other
structural issues that prevent PyMuPDF from saving them directly. If a
``ValueError`` or ``RuntimeError`` is raised during ``doc.save()``, the
script falls back to creating a repaired copy in memory using
``doc.tobytes(garbage=4, deflate=True)``. It then opens this repaired
content as a new document and saves it to disk. This technique is
recommended by the PyMuPDF documentation for recovering from broken
object streams.
Usage::
python enhanced_sanitize_pdf.py path/to/input.pdf
The cleaned PDF will be created in the same directory as ``input.pdf`` with
``_clean`` appended before the ``.pdf`` extension. The script prints the
number of watermark segments removed.
This script requires the PyMuPDF (``fitz``) package.
"""
import sys
import os
import re
from typing import Tuple, Optional
import fitz # type: ignore[import]
def open_with_password(path: str) -> fitz.Document:
    """Open a PDF file, prompting the user for a password if necessary.

    PyMuPDF opens encrypted PDFs without raising; such a document reports
    ``needs_pass`` as true and must be unlocked with ``authenticate()``
    before its content can be accessed. Errors raised by ``fitz.open``
    itself (for example a missing or unreadable file) are unrelated to
    encryption and are propagated unchanged.

    Parameters
    ----------
    path:
        Location of the PDF file to open.

    Returns
    -------
    fitz.Document
        The opened (and, if it was encrypted, authenticated) document.

    Raises
    ------
    ValueError
        If the document is encrypted and the entered password is rejected.
    """
    doc = fitz.open(path)
    if not doc.needs_pass:
        # Unencrypted (or no user password required): no prompt is shown.
        return doc

    print(f"The document '{path}' is encrypted.")
    password = input("Please enter the password: ")
    # authenticate() returns 0 (falsy) when the password is rejected.
    if not doc.authenticate(password):
        # Release the file handle before aborting.
        doc.close()
        raise ValueError("Incorrect password provided. Unable to open the PDF.")
    return doc
def remove_watermark_segments(doc: fitz.Document) -> int:
    """Delete watermark ``BT … ET`` text objects from every page's content streams.

    A text object is treated as a watermark when it selects a font named
    ``/F<digits>`` with one of these size/mode combinations:

    * size 36 followed by a ``1 Tr`` or ``3 Tr`` rendering-mode operator,
    * size -25 (the diagonal overlay),
    * size 10 or size 18 (the footer lines).

    Each content stream is decoded as latin1, every pattern is applied in
    turn, and the stream is written back only when something was removed.

    NOTE(review): the ``[^B]`` / ``[^E]`` character classes stop a match
    from running into a neighbouring text object, but they also mean a
    candidate containing an uppercase ``B`` before the font operator, or
    an uppercase ``E`` after it, is not matched.

    Returns
    -------
    int
        Total number of text objects removed across all pages.
    """
    watermark_patterns = tuple(
        re.compile(expression, re.S)
        for expression in (
            # 36-point text with rendering mode 1 or 3
            r"BT[^B]*?/F\d+\s+36\s+Tf[^E]*?\s+(?:1|3)\s+Tr[^E]*?ET",
            # Diagonal overlay, font size -25
            r"BT[^B]*?/F\d+\s+-25\s+Tf[^E]*?ET",
            # Footer lines, 10-point font
            r"BT[^B]*?/F\d+\s+10\s+Tf[^E]*?ET",
            # Footer lines, 18-point font
            r"BT[^B]*?/F\d+\s+18\s+Tf[^E]*?ET",
        )
    )

    total = 0
    for page in doc:
        # A page may reference several content streams; handle each one.
        for xref in page.get_contents():
            original = doc.xref_stream(xref).decode("latin1")
            cleaned = original
            for pattern in watermark_patterns:
                cleaned, hits = pattern.subn("", cleaned)
                total += hits
            if cleaned == original:
                # Nothing matched: leave this stream untouched.
                continue
            doc.update_stream(xref, cleaned.encode("latin1"))
    return total
def build_output_path(input_path: str) -> str:
    """Return ``input_path`` with ``_clean`` inserted before the file extension.

    The directory part is kept, so the result points next to the input
    file. A file name without an extension simply gets ``_clean`` appended.
    """
    folder = os.path.dirname(input_path)
    stem, suffix = os.path.splitext(os.path.basename(input_path))
    return os.path.join(folder, stem + "_clean" + suffix)
def save_document(doc: fitz.Document, out_path: str) -> None:
    """Save the document to ``out_path`` with a repair fallback.

    The document is first saved directly with garbage collection and
    stream compression enabled. If that raises ``ValueError`` or
    ``RuntimeError``, the document is exported to an in-memory byte
    buffer, reopened from that buffer as a new document, and the new
    document is saved instead.

    Parameters
    ----------
    doc:
        The (already modified) document to write.
    out_path:
        Destination file path.

    Raises
    ------
    ValueError, RuntimeError
        Propagated from the fallback if the repair attempt also fails.
    """
    try:
        doc.save(out_path, garbage=4, deflate=True)
    except (ValueError, RuntimeError) as exc:
        print(f"Encountered an error while saving: {exc}. Attempting repair …")
        # Round-trip through an in-memory copy so the file is rewritten
        # from scratch before the second save attempt.
        buffer = doc.tobytes(garbage=4, deflate=True)
        repaired = fitz.open("pdf", buffer)
        try:
            repaired.save(out_path, garbage=4, deflate=True)
        finally:
            # Always release the temporary document, even if saving fails.
            repaired.close()
def sanitize_pdf(path: str) -> None:
    """Remove watermarks from the PDF at ``path`` and write a cleaned copy.

    The output file name is derived with ``build_output_path``. On
    success, the number of removed segments and the output location are
    printed. Exceptions from opening, cleaning or saving are propagated;
    the document is closed in every case.
    """
    doc = open_with_password(path)
    try:
        removed = remove_watermark_segments(doc)
        out_path = build_output_path(path)
        save_document(doc, out_path)
    finally:
        # Release the file handle even when cleaning or saving fails.
        doc.close()
    print(f"Removed {removed} watermark segment(s). Cleaned PDF saved as '{out_path}'.")
def main(argv: list[str]) -> None:
    """Command-line entry point.

    ``argv`` is expected to hold the program name followed by exactly one
    argument, the input PDF path. With any other argument count a usage
    line is printed and nothing else happens.
    """
    if len(argv) == 2:
        sanitize_pdf(argv[1])
        return
    print("Usage: python enhanced_sanitize_pdf.py input.pdf")
# Run the command-line entry point only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    main(sys.argv)

8
requirements.txt Normal file
View File

@@ -0,0 +1,8 @@
# Requirements for the SANS watermark removal tool
#
# This tool requires the PyMuPDF package to manipulate PDF files. Install
# it using pip before running the script:
# pip install -r requirements.txt
PyMuPDF>=1.23.0