Adds logic to migrate_qs2_to_qs3.py to convert:

- #showimg to Markdown image syntax ![alt](src).
- #linkimg to Markdown linked image syntax [![alt](src)](src).
- #ytvideo to an HTML iframe embed for YouTube videos.

Image path processing handles absolute URLs and prepends /images/ to relative paths, matching previous qsgen2 behavior but omitting its hardcoded styling attributes, which have no place in Markdown output.
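For example (illustrative input, not taken from a real site):

    #showimg logo.png¤Site logo¤   ->  ![Site logo](/images/logo.png)
    #linkimg logo.png¤Site logo¤   ->  [![Site logo](/images/logo.png)](/images/logo.png)
    #ytvideo dQw4w9WgXcQ           ->  <iframe ... src="https://www.youtube.com/embed/dQw4w9WgXcQ" ...></iframe>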
import re
import argparse
from pathlib import Path

# Tags that are kept as HTML because they have no direct Markdown equivalents
HTML_PASSTHROUGH_TAGS = {
    "#DV": "<div>", "#EDV": "</div>",
    "#SPN": "<span>", "#ESPN": "</span>",
    "#TBL": "<table>", "#ETBL": "</table>",
    "#TR": "<tr>", "#ETR": "</tr>",
    "#TD": "<td>", "#ETD": "</td>",
    "#TH": "<th>", "#ETH": "</th>",
    "#ART": "<article>", "#EART": "</article>",
    "#SEC": "<section>", "#ESEC": "</section>",
    "#ASIDE": "<aside>", "#EASIDE": "</aside>",
    "#NAV": "<nav>", "#ENAV": "</nav>",
    "#BTN": "<button>", "#EBTN": "</button>",
    "#SEL": "<select>", "#ESEL": "</select>",
    "#OPT": "<option>", "#EOPT": "</option>",
}

def sanitize_filename(name):
    """Sanitizes a string to be a valid filename."""
    name = name.lower()
    name = re.sub(r'\s+', '-', name)  # Replace runs of whitespace with hyphens
    name = re.sub(r'[^a-z0-9\-_.]', '', name)  # Remove unwanted characters
    name = re.sub(r'-+', '-', name)  # Collapse multiple hyphens into one
    name = name.strip('-_')
    return name if name else "untitled"
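
# Illustrative examples (hypothetical titles, not from the migration corpus):
#   sanitize_filename("Hello, World! -- Draft #2")  ->  "hello-world-draft-2"
#   sanitize_filename("???")                        ->  "untitled"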


def convert_qstags_to_markdown(content):
    """Converts qstags in content to Markdown syntax."""

    # Start with a copy to modify
    md_content = content

    # Links: #link URL¤TEXT¤ -> [TEXT](URL)
    md_content = re.sub(r'#link\s+([^¤]+)¤([^¤]+)¤', r'[\2](\1)', md_content)

    # Headings: #H1...#EH1 -> # ..., etc.
    for i in range(6, 0, -1):
        # Capture content between #Hi and #EHi; case-insensitive, DOTALL for newlines.
        # #EHi is optional if the heading sits at the end of a section or file.
        md_content = re.sub(r"#H{i}(.*?)(?:#EH{i}|$)".format(i=i),
                            r"{} \1".format("#" * i),
                            md_content, flags=re.IGNORECASE | re.DOTALL)
        # Clean up potential multiple newlines left by the DOTALL capture if content was multi-line
        md_content = re.sub(r"({} .*?)\n\n".format("#" * i), r"\1\n", md_content)

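    # e.g. "#H2About us#EH2" -> "## About us" (whitespace inside the capture is kept verbatim)
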
    # Blockquotes: #Q...#EQ -> > ...
    # Simplified approach: multi-line blockquotes need a '>' prefix on each line,
    # so capture the content and process it line by line.
    def replace_blockquote(match):
        inner_content = match.group(1).strip()
        lines = inner_content.split('\n')
        return '\n'.join([f"> {line}" for line in lines]) + '\n'
    md_content = re.sub(r"#Q(.*?)(?:#EQ|$)", replace_blockquote, md_content, flags=re.IGNORECASE | re.DOTALL)

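    # e.g. "#Q Stay hungry.\nStay foolish. #EQ" -> "> Stay hungry.\n> Stay foolish.\n"
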
    # Ordered Lists: #OL ... #LI ... #ELI ... #EOL
    def replace_ordered_list(match_ol):
        ol_content = match_ol.group(1).strip()
        list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ol_content, flags=re.IGNORECASE | re.MULTILINE)
        processed_items = []
        for i, item_text in enumerate(list_item_texts):
            processed_items.append(f"{i + 1}. {item_text.strip()}")
        return "\n".join(processed_items) + ("\n" if processed_items else "")

    md_content = re.sub(r"^[ \t]*#OL[ \t]*\n(.*?)\n^[ \t]*#EOL[ \t]*$", replace_ordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

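    # e.g. "#OL\n#LI First #ELI\n#LI Second #ELI\n#EOL" -> "1. First\n2. Second\n"
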
    # Unordered Lists: #UL ... #LI ... #ELI ... #EUL
    def replace_unordered_list(match_ul):
        ul_content = match_ul.group(1).strip()
        list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ul_content, flags=re.IGNORECASE | re.MULTILINE)
        processed_items = []
        for item_text in list_item_texts:
            processed_items.append(f"- {item_text.strip()}")
        return "\n".join(processed_items) + ("\n" if processed_items else "")

    md_content = re.sub(r"^[ \t]*#UL[ \t]*\n(.*?)\n^[ \t]*#EUL[ \t]*$", replace_unordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

    # Remove any stray #ELI tags that weren't consumed by the #LI regex (unlikely, but for cleanup)
    md_content = re.sub(r"^[ \t]*#ELI[ \t]*$", "", md_content, flags=re.IGNORECASE | re.MULTILINE)

    # Paragraphs: remove #P, replace #EP with a newline to help separate blocks.
    # Markdown relies on blank lines between paragraphs.
    md_content = re.sub(r"#P\s*", "", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"\s*#EP", "\n", md_content, flags=re.IGNORECASE)

    # Inline elements
    md_content = re.sub(r"#BD(.*?)#EBD", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#STRONG(.*?)#ESTRONG", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#I(.*?)#EI", r"*\1*", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#EM(.*?)#SEM", r"*\1*", md_content, flags=re.IGNORECASE)  # Assuming #SEM is the end tag for emphasis
    md_content = re.sub(r"#C(.*?)#EC", r"`\1`", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#UD(.*?)#EUD", r"\1", md_content, flags=re.IGNORECASE)  # Markdown has no underline; strip the tags

    # Images: #showimg IMAGE_PATH¤ALT_TEXT¤ -> ![ALT_TEXT](PROCESSED_IMAGE_PATH)
    def process_image_path_for_markdown(raw_path):
        if raw_path.startswith(('http://', 'https://', '/')):
            return raw_path
        else:
            return f"/images/{raw_path}"

    def replace_showimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)
        return f"![{alt_text}]({processed_path})"
    md_content = re.sub(r'#showimg\s+([^¤]+)¤([^¤]+)¤', replace_showimg_to_markdown, md_content)

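    # Path handling mirrors qsgen2: absolute URLs and rooted paths pass through
    # untouched; bare relative paths get the /images/ prefix. For example:
    #   process_image_path_for_markdown("logo.png")           -> "/images/logo.png"
    #   process_image_path_for_markdown("https://a.b/c.png")  -> "https://a.b/c.png"
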
    # Linked Images: #linkimg IMAGE_PATH¤ALT_TEXT¤ -> [![ALT_TEXT](PROCESSED_IMAGE_PATH)](PROCESSED_IMAGE_PATH)
    def replace_linkimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)  # Reuse the same path processor
        return f"[![{alt_text}]({processed_path})]({processed_path})"
    md_content = re.sub(r'#linkimg\s+([^¤]+)¤([^¤]+)¤', replace_linkimg_to_markdown, md_content)

    # YouTube Videos: #ytvideo YOUTUBE_ID -> HTML iframe embed
    def replace_ytvideo_to_html(match):
        video_id = match.group(1)
        return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
    md_content = re.sub(r'#ytvideo\s+([A-Za-z0-9_\-]+)', replace_ytvideo_to_html, md_content)

    # Line break: #BR -> two trailing spaces + newline (Markdown hard break)
    md_content = md_content.replace("#BR", "  \n")

    # Literal character tags (Markdown handles these characters directly)
    md_content = md_content.replace("#LT", "<")
    md_content = md_content.replace("#GT", ">")
    md_content = md_content.replace("#NUM", "#")

    # Passthrough HTML for tags without direct Markdown equivalents
    for qstag, html_tag in HTML_PASSTHROUGH_TAGS.items():
        md_content = md_content.replace(qstag, html_tag)

    # Final cleanup:
    # Normalize runs of blank lines to a single blank line (Markdown standard for paragraph separation)
    md_content = re.sub(r"\n\s*\n", "\n\n", md_content)
    # Remove leading/trailing whitespace from the whole content
    md_content = md_content.strip()

    return md_content

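# A quick end-to-end sketch (hypothetical input):
#   convert_qstags_to_markdown("#P This is #BDbold#EBD text #EP")
#   -> "This is **bold** text"
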

def process_blog_file(file_path, output_dir_base):
    """Processes a .blog file and creates a new Markdown file."""
    print(f"Processing blog file: {file_path}")
    content_lines = file_path.read_text().splitlines()

    metadata = {
        "title": "Untitled Post",
        "date": "",
        "layout": "post",
        "author": "Anonymous"
    }
    body_content = []

    # Extract the date from the filename (e.g., 20250530-3.blog)
    match_date_filename = re.match(r'(\d{8})-\d+\.blog', file_path.name)
    if match_date_filename:
        date_str = match_date_filename.group(1)
        metadata['date'] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
    else:
        print(f" [WARN] Could not parse date from filename: {file_path.name}. Skipping date.")

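    # e.g. "20250530-3.blog" yields metadata['date'] == "2025-05-30"
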
    parsing_ingress = False
    parsing_body = False

    for line in content_lines:
        if line.startswith("DATE "):
            # The DATE field in the file is secondary to the filename date for posts
            pass
        elif line.startswith("BLOG_TITLE "):
            metadata['title'] = line.replace("BLOG_TITLE ", "", 1).strip()
        elif line.strip() == "#INGRESS_START":
            parsing_ingress = True
            continue
        elif line.strip() == "#INGRESS_STOP":
            parsing_ingress = False
            continue
        elif line.strip() == "#BODY_START":
            parsing_body = True
            continue
        elif line.strip() == "#BODY_STOP":
            parsing_body = False
            continue

        if parsing_ingress or parsing_body:
            body_content.append(line)

    markdown_body = convert_qstags_to_markdown("\n".join(body_content))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',  # Single quotes for the f-string, double quotes for the YAML value
        f"date: {metadata['date']}",
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        ""
    ]

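    # The rendered frontmatter looks like this (illustrative values):
    #   ---
    #   title: "My First Post"
    #   date: 2025-05-30
    #   layout: post
    #   author: Anonymous
    #   ---
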
    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    if not metadata['date']:
        # Fallback if the date couldn't be parsed, though unlikely for .blog files
        output_subdir = Path(output_dir_base) / "blog" / "unknown_date"
    else:
        year, month, day = metadata['date'].split('-')
        output_subdir = Path(output_dir_base) / "blog" / year / month / day

    output_subdir.mkdir(parents=True, exist_ok=True)
    output_file_path = output_subdir / f"{sanitized_title}.md"

    output_file_path.write_text(output_content)
    print(f" -> Created: {output_file_path}")


def process_qst_file(file_path, output_dir_base):
    """Processes a .qst file and creates a new Markdown file."""
    print(f"Processing page file: {file_path}")
    content_lines = file_path.read_text().splitlines()

    metadata = {
        "title": "Untitled Page",
        "layout": "page",
        "author": "Anonymous"  # Added for consistency with posts
    }
    body_content_lines = []

    if content_lines and content_lines[0].startswith("#title="):
        metadata['title'] = content_lines[0].replace("#title=", "", 1).strip()
        body_content_lines = content_lines[1:]
    else:
        print(f" [WARN] No #title= found in {file_path.name}. Using filename as title.")
        metadata['title'] = file_path.stem
        body_content_lines = content_lines

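    # e.g. a first line of "#title=About Me" gives title "About Me" and output file about-me.md
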
    markdown_body = convert_qstags_to_markdown("\n".join(body_content_lines))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',  # Single quotes for the f-string, double quotes for the YAML value
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        ""
    ]

    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    # Pages go into the root of output_dir_base (e.g. content/)
    output_file_path = Path(output_dir_base) / f"{sanitized_title}.md"

    output_file_path.write_text(output_content)
    print(f" -> Created: {output_file_path}")


def main():
    parser = argparse.ArgumentParser(description="Migrate qsgen2 (.blog, .qst) files to qsgen3 Markdown format.")
    parser.add_argument("--source-dir", required=True, help="Directory containing old .blog and .qst files.")
    parser.add_argument("--output-dir", required=True, help="Directory to save new Markdown files (e.g., your qsgen3 'content' directory).")
    args = parser.parse_args()

    source_path = Path(args.source_dir)
    output_path = Path(args.output_dir)

    if not source_path.is_dir():
        print(f"Error: Source directory '{source_path}' not found or not a directory.")
        return

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Source directory: {source_path.resolve()}")
    print(f"Output directory: {output_path.resolve()}")

    for item in source_path.rglob('*'):  # rglob finds files in subdirectories too, if any
        if item.is_file():
            if item.name.endswith(".blog"):
                process_blog_file(item, output_path)
            elif item.name.endswith(".qst"):
                process_qst_file(item, output_path)

    print("\nMigration complete.")


if __name__ == "__main__":
    main()
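
# Example invocation (paths are illustrative):
#   python migrate_qs2_to_qs3.py --source-dir old_site/ --output-dir content/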