# docker_file_analysis/scripts/convert-remnux-cheats.py

#!/usr/bin/env python3

"""
convert-remnux-cheats.py

Parse a consolidated markdown cheatsheet (e.g., remnux-tldr-cheatsheet.md)
and generate cheat sheets and TLDR pages for tools missing from the system.

Heuristics:
- A tool section is detected by a heading line (## or ###). The heading text
  is the tool display name. If parentheses provide aliases, the first alias is
  used as the canonical filename (e.g., "7-Zip (7z, 7za)" => 7z).
- Within a section, bullet lines ("- " or "* ") are treated as descriptions.
  Inline code `...` on those lines is extracted as commands.
- Fenced code blocks (``` ... ```) are extracted; each non-empty line becomes a command entry.
- TLDR entries: produce simple bullets with the command in backticks and the description when available.
- Cheat entries: print description as a comment line, followed by the command.

By default, only missing files are created. Existing cheat or TLDR files are left intact
unless --overwrite is provided.
"""

import argparse
import os
import re
from pathlib import Path

# Level-2 or level-3 markdown heading ("## x" / "### x"); group 1 is the run of
# hashes, group 2 is the heading text without trailing whitespace.
HDR_RE = re.compile(r"^(#{2,3})\s+(.+?)\s*$")
# A line that begins with a triple-backtick code fence; group 1 is whatever
# follows the backticks on that line. The pattern has no allowance for leading
# whitespace, so indented fences do not match.
FENCE_RE = re.compile(r"^```(.*)$")
# An inline code span delimited by single backticks; group 1 is the enclosed text.
INLINE_CODE_RE = re.compile(r"`([^`]+)`")


def _sanitize_command_name(text: str) -> str:
    """Reduce *text* to characters that are plausible in a command/file name.

    Every character outside ``[a-zA-Z0-9._+-]`` becomes a hyphen and
    leading/trailing hyphens are dropped. A result consisting only of dots
    ("." or "..") is rejected (returned as "") because it would resolve to a
    directory rather than a file when joined onto an output directory.
    """
    safe = re.sub(r"[^a-zA-Z0-9._+-]", "-", text).strip("-")
    return safe if safe.strip(".") else ""


def choose_canonical_name(heading: str) -> str:
    """Derive the canonical tool name (used as a filename) from a heading.

    For a heading like "7-Zip (7z, 7za, 7zr, 7zz)" the first usable alias
    inside the parentheses is returned ("7z"). Without usable aliases, the
    parenthesised part and anything after " - " are dropped and the remaining
    heading text is sanitized (e.g. "Foo Bar - description" -> "Foo-Bar").

    Aliases go through the same sanitizing as heading text, so markdown
    decoration or path separators in the source document (for example
    "`a/b`") never end up in the returned name.

    Args:
        heading: The heading text without the leading '#' characters.

    Returns:
        The sanitized name with its original letter case, or "" when nothing
        usable remains.
    """
    text = heading.strip()
    # If parentheses include aliases, pick the first alias that survives
    # sanitizing (aliases are separated by commas and/or whitespace).
    m = re.search(r"\(([^)]+)\)", text)
    if m:
        for token in re.split(r"[\s,]+", m.group(1)):
            cand = _sanitize_command_name(token)
            if cand:
                return cand
    # Otherwise, drop parentheses and anything after a dash separator, as in
    # "Tool - description".
    text = re.sub(r"\(.*?\)", "", text)
    text = text.split(" - ")[0]
    # Case is preserved here; callers lowercase where they need to.
    return _sanitize_command_name(text.strip())


def _flush_fence_lines(section, fence_lines):
    """Append each non-blank, stripped fenced line to the section's "cmds"."""
    if not section:
        return
    for cmdline in fence_lines:
        cmdline = cmdline.strip()
        if cmdline:
            section["cmds"].append(cmdline)


def parse_sections(lines):
    """Split markdown lines into per-tool sections with their commands.

    A heading matched by HDR_RE (outside a code fence) starts a new section.
    Within a section:

    - every inline code span on a bullet or plain-text line becomes a
      ``(desc, cmd)`` pair, where ``desc`` is the line with each code span
      replaced by "{}" (and the bullet marker removed);
    - every non-blank line inside a fenced code block becomes a command.

    Content that appears before the first heading is ignored, as are lines
    without any inline code.

    Args:
        lines: Iterable of text lines; a trailing newline on each is tolerated.

    Returns:
        A list of dicts with the keys "display" (heading text), "name"
        (result of choose_canonical_name), "desc_cmds" (list of
        ``(desc, cmd)`` tuples) and "cmds" (list of fenced command lines),
        in document order.
    """
    sections = []
    current = None
    in_fence = False
    fence_accum = []

    for raw in lines:
        line = raw.rstrip("\n")

        # Headings only count outside fences: "## ..." inside a code block is
        # most likely a shell comment, not a new section.
        h = HDR_RE.match(line)
        if h and not in_fence:
            if current:
                sections.append(current)
            hdr_text = h.group(2).strip()
            current = {
                "display": hdr_text,
                "name": choose_canonical_name(hdr_text),
                "desc_cmds": [],  # list of (desc, cmd)
                "cmds": [],       # list of commands (from code fences)
            }
            continue

        # A fence line toggles fenced mode; the closing fence flushes what
        # was collected since the opening one.
        if FENCE_RE.match(line):
            if in_fence:
                _flush_fence_lines(current, fence_accum)
            in_fence = not in_fence
            fence_accum = []
            continue

        if in_fence:
            fence_accum.append(line)
            continue

        # Bullets and plain text are handled alike: each inline code span is
        # a command described by the rest of the line.
        codes = INLINE_CODE_RE.findall(line)
        if not codes or current is None:
            continue
        desc = INLINE_CODE_RE.sub("{}", line)
        if re.match(r"^\s*[-*]\s+", line):
            desc = re.sub(r"^\s*[-*]\s+", "", desc)
        desc = desc.strip()
        for code in codes:
            current["desc_cmds"].append((desc, code.strip()))

    # A fence left open at end of input still holds commands; keep them
    # instead of silently dropping the tail of the document.
    if in_fence:
        _flush_fence_lines(current, fence_accum)

    if current:
        sections.append(current)
    return sections


def ensure_dir(p: Path):
    """Create directory *p* and any missing parents; do nothing if it exists."""
    mkdir_options = {"parents": True, "exist_ok": True}
    p.mkdir(**mkdir_options)


def write_cheat(tool_name: str, section, out_dir: Path, overwrite: bool) -> bool:
    """Write the cheat sheet for one section to ``out_dir / tool_name``.

    The file has no extension. It starts with the section's display heading
    as a "# " comment, followed by each described command (description as a
    "# " comment line, then the command) and finally the bare commands taken
    from code fences, with a blank line between entries.

    Returns:
        False if the file already exists and *overwrite* is false (nothing is
        written), True once the file has been written.
    """
    target = out_dir / tool_name
    if target.exists() and not overwrite:
        return False

    # Title comment, then a blank separator line.
    body = [f"# {section['display']}", ""]

    # Commands that came with a description (bullets / inline code).
    for desc, cmd in section.get("desc_cmds", []):
        entry = [f"# {desc}", cmd, ""] if desc else [cmd, ""]
        body.extend(entry)

    # Commands lifted verbatim from fenced code blocks.
    for cmd in section.get("cmds", []):
        body.extend((cmd, ""))

    # Trim trailing blank lines so the file ends with exactly one newline.
    content = "\n".join(body).rstrip() + "\n"
    target.write_text(content, encoding="utf-8")
    return True


def write_tldr(tool_name: str, section, out_dir: Path, overwrite: bool) -> bool:
    """Write the TLDR page for one section to ``out_dir / <tool_name lowercased>.md``.

    The page consists of a "# <tool_name>" title, a "> Generated from ..."
    note, and one "- <description>:" bullet followed by the backticked
    command per entry. Entries without a description, and all commands taken
    from code fences, use the generic "- Example:" bullet.

    Returns:
        False if the page already exists and *overwrite* is false (nothing is
        written), True once the page has been written.
    """
    target = out_dir / f"{tool_name.lower()}.md"
    if target.exists() and not overwrite:
        return False

    page = [
        f"# {tool_name}",
        "",
        "> Generated from remnux-tldr-cheatsheet.md. Review for accuracy.",
        "",
    ]

    # Described commands come first; fenced commands follow with an empty
    # description so they fall back to the generic label.
    entries = list(section.get("desc_cmds", []))
    entries.extend(("", fenced) for fenced in section.get("cmds", []))

    for desc, cmd in entries:
        label = f"- {desc}:" if desc else "- Example:"
        page.extend((label, "", f"`{cmd}`", ""))

    # Trim trailing blank lines so the page ends with exactly one newline.
    content = "\n".join(page).rstrip() + "\n"
    target.write_text(content, encoding="utf-8")
    return True


def main():
    """Parse the command line and convert the markdown cheatsheet.

    Reads the file given by --in, splits it into tool sections with
    parse_sections(), writes one cheat sheet (into --cheat-dir) and one TLDR
    page (into --tldr-dir) per section, and prints a created/skipped summary.

    Existing output files are left intact unless --overwrite is given.
    --only-missing spells out that default explicitly; when it is combined
    with --overwrite a warning is printed and --overwrite wins.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="infile", required=True, help="Path to remnux-tldr-cheatsheet.md")
    ap.add_argument("--cheat-dir", required=True, help="Output directory for cheat sheets")
    ap.add_argument("--tldr-dir", required=True, help="Output directory for TLDR pages")
    ap.add_argument("--only-missing", action="store_true", help="Only create files that don't already exist")
    ap.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
    args = ap.parse_args()

    if args.overwrite and args.only_missing:
        print("[!] --overwrite and --only-missing are mutually exclusive; using --overwrite")
        args.only_missing = False

    # Replacing existing files must be an explicit opt-in. Deriving this from
    # "not --only-missing" would make a plain invocation clobber existing
    # pages, contradicting the documented default.
    overwrite = args.overwrite

    lines = Path(args.infile).read_text(encoding="utf-8", errors="replace").splitlines()
    sections = parse_sections(lines)

    cheat_dir = Path(args.cheat_dir)
    tldr_dir = Path(args.tldr_dir)
    ensure_dir(cheat_dir)
    ensure_dir(tldr_dir)

    created_cheat = created_tldr = 0
    skipped_cheat = skipped_tldr = 0

    for sec in sections:
        tool_name = sec["name"]
        # Write cheat
        if write_cheat(tool_name, sec, cheat_dir, overwrite=overwrite):
            created_cheat += 1
        else:
            skipped_cheat += 1
        # Write tldr
        if write_tldr(tool_name, sec, tldr_dir, overwrite=overwrite):
            created_tldr += 1
        else:
            skipped_tldr += 1

    print(f"✅ Conversion complete: cheat created={created_cheat}, skipped={skipped_cheat}; tldr created={created_tldr}, skipped={skipped_tldr}")


# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()