import re
import argparse
from pathlib import Path

# Tags that will be kept as HTML as they don't have direct Markdown equivalents.
# NOTE(review): the literal HTML values in this dict were destroyed by an
# HTML-unaware extraction of this file; they are reconstructed here from the
# qstag names (#DV -> <div>, #EDV -> </div>, ...). Confirm each against the
# qsgen2 tag reference before relying on converted output.
HTML_PASSTHROUGH_TAGS = {
    "#DV": "<div>",
    "#EDV": "</div>",
    "#SPN": "<span>",
    "#ESPN": "</span>",
    "#TBL": "<table>",
    "#ETBL": "</table>",
    "#TR": "<tr>",
    "#ETR": "</tr>",
    "#TD": "<td>",
    "#ETD": "</td>",
    "#TH": "<th>",
    "#ETH": "</th>",
    "#ART": "<article>",
    "#EART": "</article>",
    "#SEC": "<section>",
    "#ESEC": "</section>",
    "#ASIDE": "<aside>",
    "#NAV": "<nav>",
    "#BTN": "<button>",
    "#SEL": "<select>",
    "#OPT": "<option>",
}


def sanitize_filename(name):
    """Sanitize a string so it can be used as a URL-friendly filename.

    Lowercases, turns whitespace runs into single hyphens, strips anything
    that is not [a-z0-9-_.], and falls back to "untitled" when nothing
    usable remains.
    """
    name = name.lower()
    name = re.sub(r'\s+', '-', name)           # Replace spaces with hyphens
    name = re.sub(r'[^a-z0-9\-_.]', '', name)  # Remove unwanted characters
    name = re.sub(r'-+', '-', name)            # Replace multiple hyphens with single
    name = name.strip('-_')
    return name if name else "untitled"


def convert_qstags_to_markdown(content):
    """Convert qsgen2 qstags in *content* to Markdown syntax.

    Tags without a Markdown equivalent are passed through as raw HTML
    (see HTML_PASSTHROUGH_TAGS). Returns the converted, whitespace-trimmed
    Markdown string.
    """
    md_content = content

    # Links: #link URL¤TEXT¤ -> [TEXT](URL)
    md_content = re.sub(r'#link\s+([^¤]+)¤([^¤]+)¤', r'[\2](\1)', md_content)

    # Headings: #H1...#EH1 -> # ..., etc.
    for i in range(6, 0, -1):
        # Capture content between #Hi and #EHi, case-insensitive, DOTALL so
        # headings may span lines.  #EHi is optional at end of input.
        md_content = re.sub(
            r"#H{i}(.*?)(?:#EH{i}|$)".format(i=i),
            r"{} \1".format("#" * i),
            md_content,
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Clean up potential doubled newlines left by the DOTALL capture.
        md_content = re.sub(r"({} .*?)\n\n".format("#" * i), r"\1\n", md_content)

    # Blockquotes: #Q...#EQ -> "> ..."; every captured line gets a '>' prefix.
    def replace_blockquote(match):
        inner_content = match.group(1).strip()
        lines = inner_content.split('\n')
        return '\n'.join([f"> {line}" for line in lines]) + '\n'

    md_content = re.sub(r"#Q(.*?)(?:#EQ|$)", replace_blockquote, md_content,
                        flags=re.IGNORECASE | re.DOTALL)

    # One shared per-line item pattern for both list kinds: "#LI text [#ELI]".
    list_item_re = r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$"

    # Ordered lists: #OL ... #LI ... #ELI ... #EOL -> "1. ...", "2. ..."
    def replace_ordered_list(match_ol):
        ol_content = match_ol.group(1).strip()
        item_texts = re.findall(list_item_re, ol_content,
                                flags=re.IGNORECASE | re.MULTILINE)
        processed = [f"{idx + 1}. {text.strip()}"
                     for idx, text in enumerate(item_texts)]
        return "\n".join(processed) + ("\n" if processed else "")

    md_content = re.sub(
        r"^[ \t]*#OL[ \t]*\n(.*?)\n^[ \t]*#EOL[ \t]*$",
        replace_ordered_list, md_content,
        flags=re.IGNORECASE | re.DOTALL | re.MULTILINE,
    )

    # Unordered lists: #UL ... #LI ... #ELI ... #EUL -> "- ..."
    def replace_unordered_list(match_ul):
        ul_content = match_ul.group(1).strip()
        item_texts = re.findall(list_item_re, ul_content,
                                flags=re.IGNORECASE | re.MULTILINE)
        processed = [f"- {text.strip()}" for text in item_texts]
        return "\n".join(processed) + ("\n" if processed else "")

    md_content = re.sub(
        r"^[ \t]*#UL[ \t]*\n(.*?)\n^[ \t]*#EUL[ \t]*$",
        replace_unordered_list, md_content,
        flags=re.IGNORECASE | re.DOTALL | re.MULTILINE,
    )

    # Remove any stray #ELI tags not consumed by the #LI pattern (cleanup).
    md_content = re.sub(r"^[ \t]*#ELI[ \t]*$", "", md_content,
                        flags=re.IGNORECASE | re.MULTILINE)

    # Paragraphs: drop #P, turn #EP into a newline -- Markdown separates
    # paragraphs with blank lines.
    md_content = re.sub(r"#P\s*", "", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"\s*#EP", "\n", md_content, flags=re.IGNORECASE)

    # Inline elements.
    md_content = re.sub(r"#BD(.*?)#EBD", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#STRONG(.*?)#ESTRONG", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#I(.*?)#EI", r"*\1*", md_content, flags=re.IGNORECASE)
    # NOTE(review): assumes #SEM closes #EM (as the original comment stated);
    # #EEM would match the #X/#EX convention every other pair uses -- confirm.
    md_content = re.sub(r"#EM(.*?)#SEM", r"*\1*", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#C(.*?)#EC", r"`\1`", md_content, flags=re.IGNORECASE)
    # Markdown has no underline; strip the tags but keep the text.
    md_content = re.sub(r"#UD(.*?)#EUD", r"\1", md_content, flags=re.IGNORECASE)

    def process_image_path_for_markdown(raw_path):
        # Absolute URLs and site-absolute paths pass through unchanged; bare
        # filenames are assumed to live under /images/.
        if raw_path.startswith(('http://', 'https://', '/')):
            return raw_path
        return f"/images/{raw_path}"

    # Images: #showimg IMAGE_PATH¤ALT_TEXT¤ -> ![ALT_TEXT](PROCESSED_PATH)
    def replace_showimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)
        return f"![{alt_text}]({processed_path})"

    md_content = re.sub(r'#showimg\s+([^¤]+)¤([^¤]+)¤',
                        replace_showimg_to_markdown, md_content)

    # Linked images: #linkimg PATH¤ALT¤ -> image wrapped in a link to itself.
    def replace_linkimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)  # same path processor
        return f"[![{alt_text}]({processed_path})]({processed_path})"

    md_content = re.sub(r'#linkimg\s+([^¤]+)¤([^¤]+)¤',
                        replace_linkimg_to_markdown, md_content)

    # YouTube videos: #ytvideo YOUTUBE_ID -> HTML iframe.
    # NOTE(review): the iframe markup was lost in extraction; reconstructed
    # with standard YouTube embed attributes -- confirm against the original.
    def replace_ytvideo_to_html(match):
        video_id = match.group(1)
        return (
            f'<iframe width="560" height="315" '
            f'src="https://www.youtube.com/embed/{video_id}" '
            f'frameborder="0" allowfullscreen></iframe>'
        )

    md_content = re.sub(r'#ytvideo\s+([A-Za-z0-9_\-]+)',
                        replace_ytvideo_to_html, md_content)

    # Line break: #BR -> two trailing spaces + newline (Markdown hard break;
    # the original comment says "two spaces" -- the extraction collapsed them).
    md_content = md_content.replace("#BR", "  \n")

    # HTML entities (these are fine as is, Markdown supports them).
    md_content = md_content.replace("#LT", "<")
    md_content = md_content.replace("#GT", ">")
    md_content = md_content.replace("#NUM", "#")

    # Passthrough HTML for tags without direct Markdown equivalents.
    for qstag, html_tag in HTML_PASSTHROUGH_TAGS.items():
        md_content = md_content.replace(qstag, html_tag)

    # Final cleanup: normalize runs of blank lines to a single blank line
    # (Markdown's paragraph separator) and trim surrounding whitespace.
    md_content = re.sub(r"\n\s*\n", "\n\n", md_content)
    return md_content.strip()


def process_blog_file(file_path, output_dir_base):
    """Process a .blog file and write a Markdown post with YAML frontmatter.

    The post date comes from the filename (YYYYMMDD-N.blog); the DATE field
    inside the file is intentionally ignored.
    """
    print(f"Processing blog file: {file_path}")
    # Source files contain '¤' delimiters, so decode explicitly as UTF-8
    # instead of trusting the locale default.
    content_lines = file_path.read_text(encoding="utf-8").splitlines()
    metadata = {
        "title": "Untitled Post",
        "date": "",
        "layout": "post",
        "author": "Anonymous",
    }
    body_content = []

    # Extract date from filename (e.g., 20250530-3.blog).
    match_date_filename = re.match(r'(\d{8})-\d+\.blog', file_path.name)
    if match_date_filename:
        date_str = match_date_filename.group(1)
        metadata['date'] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
    else:
        print(f" [WARN] Could not parse date from filename: {file_path.name}. Skipping date.")

    parsing_ingress = False
    parsing_body = False
    for line in content_lines:
        if line.startswith("DATE "):
            # DATE field in file is secondary to filename date for posts.
            pass
        elif line.startswith("BLOG_TITLE "):
            metadata['title'] = line.replace("BLOG_TITLE ", "", 1).strip()
        elif line.strip() == "#INGRESS_START":
            parsing_ingress = True
            continue
        elif line.strip() == "#INGRESS_STOP":
            parsing_ingress = False
            continue
        elif line.strip() == "#BODY_START":
            parsing_body = True
            continue
        elif line.strip() == "#BODY_STOP":
            parsing_body = False
            continue
        if parsing_ingress or parsing_body:
            body_content.append(line)

    markdown_body = convert_qstags_to_markdown("\n".join(body_content))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',
        f"date: {metadata['date']}",
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        "",
    ]
    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    # All posts go into one flat "blog" directory (no year/month/day tree).
    # BUGFIX: the original if/else on metadata['date'] produced the identical
    # path in both branches -- the dead branch has been removed.
    output_subdir = Path(output_dir_base) / "blog"
    output_subdir.mkdir(parents=True, exist_ok=True)
    output_file_path = output_subdir / f"{sanitized_title}.md"
    output_file_path.write_text(output_content, encoding="utf-8")
    print(f" -> Created: {output_file_path}")


def process_qst_file(file_path, output_dir_base):
    """Process a .qst page file and write a Markdown page with frontmatter.

    The title comes from a leading "#title=" line; when absent, the filename
    stem is used instead.
    """
    print(f"Processing page file: {file_path}")
    content_lines = file_path.read_text(encoding="utf-8").splitlines()
    metadata = {
        "title": "Untitled Page",
        "layout": "page",
        "author": "Anonymous",  # Added for consistency with blog posts
    }
    if content_lines and content_lines[0].startswith("#title="):
        metadata['title'] = content_lines[0].replace("#title=", "", 1).strip()
        body_content_lines = content_lines[1:]
    else:
        print(f" [WARN] No #title= found in {file_path.name}. Using filename as title.")
        metadata['title'] = file_path.stem
        body_content_lines = content_lines

    markdown_body = convert_qstags_to_markdown("\n".join(body_content_lines))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        "",
    ]
    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    # Pages go into the root of output_dir_base (e.g. content/).
    output_file_path = Path(output_dir_base) / f"{sanitized_title}.md"
    output_file_path.write_text(output_content, encoding="utf-8")
    print(f" -> Created: {output_file_path}")


def main():
    """CLI entry point: walk --source-dir and convert .blog/.qst files."""
    parser = argparse.ArgumentParser(
        description="Migrate qsgen2 (.blog, .qst) files to qsgen3 Markdown format.")
    parser.add_argument("--source-dir", required=True,
                        help="Directory containing old .blog and .qst files.")
    parser.add_argument("--output-dir", required=True,
                        help="Directory to save new Markdown files (e.g., your qsgen3 'content' directory).")
    args = parser.parse_args()

    source_path = Path(args.source_dir)
    output_path = Path(args.output_dir)

    if not source_path.is_dir():
        print(f"Error: Source directory '{source_path}' not found or not a directory.")
        return

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Source directory: {source_path.resolve()}")
    print(f"Output directory: {output_path.resolve()}")

    for item in source_path.rglob('*'):  # rglob to find files in subdirectories too
        if item.is_file():
            if item.name.endswith(".blog"):
                process_blog_file(item, output_path)
            elif item.name.endswith(".qst"):
                process_qst_file(item, output_path)

    print("\nMigration complete.")


if __name__ == "__main__":
    main()