# Listing metadata: 160 lines, 6.2 KiB, Python
"""
|
||
enhanced_sanitize_pdf.py
|
||
========================
|
||
|
||
This utility removes SANS‑style watermark overlays (names, e‑mails, hashes,
|
||
dates and licence lines) from a PDF without leaving selection boxes or
|
||
visible artefacts. It can open password‑protected files by prompting the
|
||
user for a password and will automatically write the cleaned document
|
||
alongside the original with ``_clean`` appended to its file name.
|
||
|
||
In addition to the basic cleaning implemented in ``sanitize_pdf_watermark.py``,
|
||
this version includes two important improvements:
|
||
|
||
* **Password handling:** If the input PDF is encrypted, the script will ask
|
||
for a password using ``input()`` and attempt to authenticate before
|
||
continuing. Should the password prove wrong, the script aborts with a
|
||
helpful message. When processing unencrypted files, no prompt will be
|
||
shown.
|
||
|
||
* **Robust saving:** Certain PDFs have corrupted object streams or other
|
||
structural issues that prevent PyMuPDF from saving them directly. If a
|
||
``ValueError`` or ``RuntimeError`` is raised during ``doc.save()``, the
|
||
script falls back to creating a repaired copy in memory using
|
||
``doc.tobytes(garbage=4, deflate=True)``. It then opens this repaired
|
||
content as a new document and saves it to disk. This technique is
|
||
recommended by the PyMuPDF documentation for recovering from broken
|
||
object streams【206307526196463†L135-L154】.
|
||
|
||
Usage::
|
||
|
||
python enhanced_sanitize_pdf.py path/to/input.pdf
|
||
|
||
The cleaned PDF will be created in the same directory as ``input.pdf`` with
|
||
``_clean`` appended before the ``.pdf`` extension. The script prints the
|
||
number of watermark segments removed.
|
||
|
||
This script requires the PyMuPDF (``fitz``) package.
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
import re
|
||
from typing import Tuple, Optional
|
||
|
||
import fitz # type: ignore[import]
|
||
|
||
|
||
def open_with_password(path: str) -> fitz.Document:
    """Open a PDF file, prompting the user for a password if necessary.

    PyMuPDF does not raise when an encrypted file is opened; it returns a
    document whose ``needs_pass`` attribute is true and expects
    ``authenticate()`` to be called before the content is used. This
    function therefore checks that flag instead of relying on an exception.

    Errors raised by ``fitz.open`` itself (missing or unreadable file, not a
    PDF, ...) are propagated unchanged rather than being reported as an
    encryption problem.

    Parameters
    ----------
    path : str
        Filesystem path of the PDF to open.

    Returns
    -------
    fitz.Document
        The opened document, authenticated if a password was required.

    Raises
    ------
    ValueError
        If the document requires a password and the entered one is rejected.
    """
    doc = fitz.open(path)
    if not doc.needs_pass:
        # Unencrypted (or no user password required): no prompt is shown.
        return doc

    print(f"The document '{path}' is encrypted.")
    try:
        password = input("Please enter the password: ")
        # authenticate() returns 0 (falsy) when the password is rejected.
        if not doc.authenticate(password):
            raise ValueError("Incorrect password provided. Unable to open the PDF.")
    except BaseException:
        # Do not leak the open handle when the prompt is interrupted or the
        # password is wrong; the caller never receives this document.
        doc.close()
        raise
    return doc
|
||
|
||
|
||
def remove_watermark_segments(doc: fitz.Document) -> int:
    """Strip watermark ``BT ... ET`` text objects from every page of ``doc``.

    Each content stream of each page is decoded as Latin-1, run through a
    fixed list of regular expressions, and written back with
    ``doc.update_stream`` only when at least one substitution changed it.
    A text object is removed when it selects an ``/F<n>`` font at one of
    these sizes (per the original author, the sizes used by the SANS
    licence overlays):

    * 36 pt followed by text rendering mode 1 or 3 (hidden names, e-mails,
      numeric codes and hashes),
    * -25 pt (rotated diagonal overlay),
    * 10 pt or 18 pt (footer lines with the licence owner's details).

    NOTE(review): the 10 pt and 18 pt patterns match on font size alone, so
    any other text object drawn at those sizes is removed as well.
    The ``[^B]`` / ``[^E]`` character classes keep a match from running
    across neighbouring text objects, but also mean a text object containing
    a literal ``B`` before the ``Tf`` operator, or ``E`` after it, is left
    untouched.

    Parameters
    ----------
    doc : fitz.Document
        Open document; its content streams are modified in place.

    Returns
    -------
    int
        The total number of removed segments across all pages.
    """
    dotall = re.S  # let the lazy character classes span line breaks
    watermark_res = (
        # 36 pt text whose rendering mode is switched to 1 or 3
        re.compile(r"BT[^B]*?/F\d+\s+36\s+Tf[^E]*?\s+(?:1|3)\s+Tr[^E]*?ET", dotall),
        # diagonal overlay: negative 25 pt font size
        re.compile(r"BT[^B]*?/F\d+\s+-25\s+Tf[^E]*?ET", dotall),
        # footer line at 10 pt
        re.compile(r"BT[^B]*?/F\d+\s+10\s+Tf[^E]*?ET", dotall),
        # footer line at 18 pt
        re.compile(r"BT[^B]*?/F\d+\s+18\s+Tf[^E]*?ET", dotall),
    )

    total = 0
    for page in doc:
        for xref in page.get_contents():
            # Latin-1 maps bytes 0-255 one-to-one, so the decode/encode
            # round trip leaves untouched bytes exactly as they were.
            original = doc.xref_stream(xref).decode("latin1")
            cleaned = original
            for regex in watermark_res:
                cleaned, hits = regex.subn("", cleaned)
                total += hits
            if cleaned == original:
                continue  # nothing matched: leave this stream alone
            doc.update_stream(xref, cleaned.encode("latin1"))
    return total
|
||
|
||
|
||
def build_output_path(input_path: str) -> str:
    """Return ``input_path`` with ``_clean`` inserted before its extension.

    The directory part is kept, so the result points next to the input file.
    A name without an extension simply gets ``_clean`` appended.
    """
    folder = os.path.dirname(input_path)
    stem, suffix = os.path.splitext(os.path.basename(input_path))
    cleaned_name = f"{stem}_clean{suffix}"
    return os.path.join(folder, cleaned_name)
|
||
|
||
|
||
def save_document(doc: fitz.Document, out_path: str) -> None:
    """Save the document to ``out_path`` with a repair fallback.

    A direct ``doc.save`` is tried first. If it raises ``ValueError`` or
    ``RuntimeError``, the document is serialised to memory with
    ``doc.tobytes``, reopened from that buffer as a fresh document, and the
    fresh document is saved instead. The temporary document is always
    closed, even when the second save fails.

    Parameters
    ----------
    doc : fitz.Document
        The document to write. It is not closed by this function.
    out_path : str
        Destination file path.

    Raises
    ------
    ValueError, RuntimeError
        Propagated from the fallback path if the repair attempt also fails.
    """
    try:
        doc.save(out_path, garbage=4, deflate=True)
    except (ValueError, RuntimeError) as exc:
        print(f"Encountered an error while saving: {exc}. Attempting repair …")
        # Re-serialise in memory with garbage collection (garbage=4) and
        # stream compression (deflate=True), then reopen the result.
        buffer = doc.tobytes(garbage=4, deflate=True)
        repaired = fitz.open(stream=buffer, filetype="pdf")
        try:
            repaired.save(out_path, garbage=4, deflate=True)
        finally:
            # Release the temporary document whether or not the save worked.
            repaired.close()
|
||
|
||
|
||
def sanitize_pdf(path: str) -> None:
    """Perform watermark removal on ``path`` and write a cleaned copy.

    Opens the PDF (prompting for a password if required), removes the
    watermark text objects, saves the result next to the input with
    ``_clean`` in the file name, and prints a one-line summary.

    Parameters
    ----------
    path : str
        Path of the PDF to clean. The input file itself is not modified.

    Raises
    ------
    ValueError
        Propagated from ``open_with_password`` on a wrong password.
    """
    doc = open_with_password(path)
    try:
        removed = remove_watermark_segments(doc)
        out_path = build_output_path(path)
        save_document(doc, out_path)
    finally:
        # Close the document even if removal or saving raised.
        doc.close()
    print(f"Removed {removed} watermark segment(s). Cleaned PDF saved as '{out_path}'.")
|
||
|
||
|
||
def main(argv: list[str]) -> None:
    """Command-line entry point.

    ``argv`` is expected to look like ``sys.argv``: the program name followed
    by exactly one PDF path. With any other argument count a usage line is
    printed and nothing else happens.
    """
    if len(argv) == 2:
        sanitize_pdf(argv[1])
    else:
        print("Usage: python enhanced_sanitize_pdf.py input.pdf")
|
||
|
||
|
||
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)