"""Migrate qsgen2 content files (.blog posts, .qst pages) to qsgen3 Markdown.

Each source file is parsed for its title (and, for posts, its date), its
qstag markup is converted to Markdown, and the result is written with a YAML
front matter block to the output directory.
"""

import argparse
import re
from pathlib import Path

# Tags that are kept as HTML because they have no direct Markdown equivalent.
# Each qstag maps to the literal HTML tag it stands for.
HTML_PASSTHROUGH_TAGS = {
    "#DV": "<div>", "#EDV": "</div>",
    "#SPN": "<span>", "#ESPN": "</span>",
    "#TBL": "<table>", "#ETBL": "</table>",
    "#TR": "<tr>", "#ETR": "</tr>",
    "#TD": "<td>", "#ETD": "</td>",
    "#TH": "<th>", "#ETH": "</th>",
    "#ART": "<article>", "#EART": "</article>",
    "#SEC": "<section>", "#ESEC": "</section>",
    "#ASIDE": "<aside>",
    "#NAV": "<nav>",
    "#BTN": "<button>",
    "#SEL": "<select>",
    "#OPT": "<option>",
    # NOTE(review): the closing tags below are assumed to follow the same
    # "#E<TAG>" convention as the pairs above -- confirm against qsgen2.
    "#EASIDE": "</aside>",
    "#ENAV": "</nav>",
    "#EBTN": "</button>",
    "#ESEL": "</select>",
    "#EOPT": "</option>",
}

# Inline qstag pairs and the Markdown delimiter that replaces them.
# Underline has no Markdown equivalent, so its tags are simply removed.
_INLINE_TAGS = (
    ("#BD", "#EBD", "**"),
    ("#STRONG", "#ESTRONG", "**"),
    ("#I", "#EI", "*"),
    ("#EM", "#SEM", "*"),  # qsgen2 closes #EM with #SEM
    ("#C", "#EC", "`"),
    ("#UD", "#EUD", ""),
)

# One "#LI text [#ELI]" entry per line inside a list block.
_LIST_ITEM_RE = re.compile(
    r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# Post filenames look like 20250530-3.blog (date, then a sequence number).
_BLOG_FILENAME_RE = re.compile(r"(\d{8})-\d+\.blog")


def sanitize_filename(name):
    """Return *name* reduced to a safe, lowercase, hyphenated file stem.

    Whitespace becomes hyphens, anything outside ``a-z0-9-_.`` is dropped,
    hyphen runs are collapsed, and leading/trailing ``-``, ``_`` and ``.``
    are removed (so a title ending in a period does not yield ``name..md``
    and a leading dot does not create a hidden file). An empty result is
    replaced by ``"untitled"``.
    """
    name = name.lower()
    name = re.sub(r"\s+", "-", name)
    name = re.sub(r"[^a-z0-9\-_.]", "", name)
    name = re.sub(r"-+", "-", name)
    name = name.strip("-_.")
    return name or "untitled"


def _image_url(raw_path):
    """Return *raw_path* unchanged if absolute/remote, else under /images/."""
    if raw_path.startswith(("http://", "https://", "/")):
        return raw_path
    return f"/images/{raw_path}"


def _render_showimg(match):
    """Render a ``#showimg PATH¤ALT¤`` match as a Markdown image."""
    return f"![{match.group(2)}]({_image_url(match.group(1))})"


def _render_linkimg(match):
    """Render a ``#linkimg PATH¤ALT¤`` match as an image linking to itself."""
    url = _image_url(match.group(1))
    return f"[![{match.group(2)}]({url})]({url})"


def _render_youtube(match):
    """Render a ``#ytvideo ID`` match as an embedded YouTube iframe."""
    return (
        '<iframe width="560" height="315" '
        f'src="https://www.youtube.com/embed/{match.group(1)}" '
        'frameborder="0" allowfullscreen></iframe>'
    )


def _convert_headings(content):
    """Convert ``#Hn ... #EHn`` (n = 1..6) to ATX headings.

    Heading text is collapsed onto a single line because an ATX heading ends
    at the first newline. A ``#Hn`` with no closing tag becomes a heading
    made of the rest of its own line only.
    """
    flags = re.IGNORECASE | re.DOTALL
    for level in range(6, 0, -1):
        hashes = "#" * level

        def closed(match, hashes=hashes):
            return f"{hashes} {' '.join(match.group(1).split())}"

        def unclosed(match, hashes=hashes):
            return f"{hashes} {match.group(1).strip()}"

        content = re.sub(rf"#H{level}(.*?)#EH{level}", closed, content, flags=flags)
        content = re.sub(
            rf"#H{level}[ \t]*([^\n]*)", unclosed, content, flags=re.IGNORECASE
        )
    return content


def _render_blockquote(match):
    """Prefix every line of a ``#Q ... #EQ`` body with ``> ``."""
    lines = match.group(1).strip().split("\n")
    return "\n".join(f"> {line}" for line in lines) + "\n"


def _convert_list(content, tag, marker):
    """Convert ``#<tag> ... #E<tag>`` list blocks to Markdown list lines.

    *marker* receives the 1-based item index and returns the list marker
    (``"-"`` for bullets, ``"1."``, ``"2."``, ... for ordered lists).
    """
    def render(match):
        items = _LIST_ITEM_RE.findall(match.group(1).strip())
        lines = [
            f"{marker(index)} {item.strip()}"
            for index, item in enumerate(items, start=1)
        ]
        return "\n".join(lines) + ("\n" if lines else "")

    return re.sub(
        rf"^[ \t]*#{tag}[ \t]*\n(.*?)\n^[ \t]*#E{tag}[ \t]*$",
        render,
        content,
        flags=re.IGNORECASE | re.DOTALL | re.MULTILINE,
    )


def _inline_wrapper(delimiter):
    """Return a ``re.sub`` callback that wraps the match in *delimiter*.

    Whitespace just inside the tags is moved outside the delimiters:
    Markdown does not treat ``** text **`` as emphasis, only ``**text**``.
    """
    def wrap(match):
        inner = match.group(1)
        text = inner.strip()
        if not text:
            return inner
        lead = inner[: len(inner) - len(inner.lstrip())]
        trail = inner[len(inner.rstrip()):]
        return f"{lead}{delimiter}{text}{delimiter}{trail}"

    return wrap


def convert_qstags_to_markdown(content):
    """Convert qstag markup in *content* to Markdown and return the result.

    Tags with a Markdown equivalent (links, images, headings, blockquotes,
    lists, paragraphs, inline emphasis/code, line breaks) are rewritten;
    tags listed in ``HTML_PASSTHROUGH_TAGS`` and YouTube embeds become HTML.
    Blank-line runs are normalized and the result is stripped.
    """
    md_content = content

    # Macros delimited by "¤" go first so later tag regexes (notably the
    # case-insensitive #LI match) cannot mistake "#linkimg" for a list item.
    md_content = re.sub(r"#link\s+([^¤]+)¤([^¤]+)¤", r"[\2](\1)", md_content)
    md_content = re.sub(r"#showimg\s+([^¤]+)¤([^¤]+)¤", _render_showimg, md_content)
    md_content = re.sub(r"#linkimg\s+([^¤]+)¤([^¤]+)¤", _render_linkimg, md_content)

    md_content = _convert_headings(md_content)

    # Blockquotes: the closing #EQ is optional at the end of the content.
    md_content = re.sub(
        r"#Q(.*?)(?:#EQ|$)",
        _render_blockquote,
        md_content,
        flags=re.IGNORECASE | re.DOTALL,
    )

    md_content = _convert_list(md_content, "OL", lambda index: f"{index}.")
    md_content = _convert_list(md_content, "UL", lambda index: "-")

    # Remove stray #ELI lines that were not consumed by a list block.
    md_content = re.sub(
        r"^[ \t]*#ELI[ \t]*$", "", md_content, flags=re.IGNORECASE | re.MULTILINE
    )

    # Paragraphs: drop #P, turn #EP into a newline so blocks stay separated.
    md_content = re.sub(r"#P\s*", "", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"\s*#EP", "\n", md_content, flags=re.IGNORECASE)

    for open_tag, close_tag, delimiter in _INLINE_TAGS:
        md_content = re.sub(
            re.escape(open_tag) + r"(.*?)" + re.escape(close_tag),
            _inline_wrapper(delimiter),
            md_content,
            flags=re.IGNORECASE,
        )

    md_content = re.sub(r"#ytvideo\s+([A-Za-z0-9_\-]+)", _render_youtube, md_content)

    # Hard line break: two spaces + newline. A newline already following #BR
    # is absorbed, otherwise the pair would form a blank line and start a new
    # paragraph instead of breaking the line.
    md_content = re.sub(r"#BR[ \t]*\n?", "  \n", md_content)

    # Emit entities rather than raw characters so the text cannot be read as
    # HTML markup or as a Markdown heading marker.
    md_content = md_content.replace("#LT", "&lt;")
    md_content = md_content.replace("#GT", "&gt;")
    md_content = md_content.replace("#NUM", "&#35;")

    # Longest tags first so one tag can never clobber a longer tag's prefix.
    for qstag in sorted(HTML_PASSTHROUGH_TAGS, key=len, reverse=True):
        md_content = md_content.replace(qstag, HTML_PASSTHROUGH_TAGS[qstag])

    # Collapse runs of blank lines to the single blank line Markdown needs.
    md_content = re.sub(r"\n\s*\n", "\n\n", md_content)
    return md_content.strip()


def _yaml_double_quoted(value):
    """Return *value* as a YAML double-quoted scalar.

    Backslashes are escaped before quotes; otherwise a literal backslash in
    the value would be read as the start of a YAML escape sequence.
    """
    return '"' + value.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _write_markdown(output_file_path, frontmatter_lines, markdown_body):
    """Write front matter plus body to *output_file_path* as UTF-8."""
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    if output_file_path.exists():
        # Two sources with the same sanitized title map to the same file.
        print(f" [WARN] Overwriting existing file: {output_file_path}")
    document = "\n".join(["---", *frontmatter_lines, "---", markdown_body]) + "\n"
    output_file_path.write_text(document, encoding="utf-8")
    print(f" -> Created: {output_file_path}")


def process_blog_file(file_path, output_dir_base):
    """Convert one ``.blog`` post and write it under ``blog/YYYY/MM/DD/``.

    The date comes from the filename (``YYYYMMDD-N.blog``); posts whose name
    does not match go to ``blog/unknown_date``. The title comes from a
    ``BLOG_TITLE`` line outside the ingress/body sections. The ingress and
    body sections are concatenated and converted to Markdown.

    Args:
        file_path: ``pathlib.Path`` of the source post.
        output_dir_base: root directory for the generated Markdown.
    """
    print(f"Processing blog file: {file_path}")
    # The format relies on the non-ASCII "¤" delimiter, so do not depend on
    # the platform's default encoding.
    content_lines = file_path.read_text(encoding="utf-8").splitlines()

    metadata = {
        "title": "Untitled Post",
        "date": "",
        "layout": "post",
        "author": "Anonymous",
    }

    match_date_filename = _BLOG_FILENAME_RE.match(file_path.name)
    if match_date_filename:
        date_str = match_date_filename.group(1)
        metadata["date"] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
    else:
        print(f" [WARN] Could not parse date from filename: {file_path.name}. Skipping date.")

    body_content = []
    parsing_ingress = False
    parsing_body = False
    for line in content_lines:
        marker = line.strip()
        if marker == "#INGRESS_START":
            parsing_ingress = True
        elif marker == "#INGRESS_STOP":
            parsing_ingress = False
        elif marker == "#BODY_START":
            parsing_body = True
        elif marker == "#BODY_STOP":
            parsing_body = False
        elif parsing_ingress or parsing_body:
            # Inside a section every line is content, even one that happens
            # to start with a metadata keyword.
            body_content.append(line)
        elif line.startswith("BLOG_TITLE "):
            metadata["title"] = line[len("BLOG_TITLE "):].strip()
        # A "DATE " line is ignored: the filename date takes precedence.

    markdown_body = convert_qstags_to_markdown("\n".join(body_content))

    frontmatter = [
        f"title: {_yaml_double_quoted(metadata['title'])}",
        f"date: {metadata['date']}",
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
    ]

    if metadata["date"]:
        year, month, day = metadata["date"].split("-")
        output_subdir = Path(output_dir_base) / "blog" / year / month / day
    else:
        output_subdir = Path(output_dir_base) / "blog" / "unknown_date"

    output_file_path = output_subdir / f"{sanitize_filename(metadata['title'])}.md"
    _write_markdown(output_file_path, frontmatter, markdown_body)


def process_qst_file(file_path, output_dir_base):
    """Convert one ``.qst`` page and write it to the output root.

    The title is taken from a leading ``#title=`` line; without one the file
    stem is used and the whole file is treated as body.

    Args:
        file_path: ``pathlib.Path`` of the source page.
        output_dir_base: root directory for the generated Markdown.
    """
    print(f"Processing page file: {file_path}")
    content_lines = file_path.read_text(encoding="utf-8").splitlines()

    metadata = {
        "title": "Untitled Page",
        "layout": "page",
        "author": "Anonymous",
    }

    if content_lines and content_lines[0].startswith("#title="):
        metadata["title"] = content_lines[0][len("#title="):].strip()
        body_content_lines = content_lines[1:]
    else:
        print(f" [WARN] No #title= found in {file_path.name}. Using filename as title.")
        metadata["title"] = file_path.stem
        body_content_lines = content_lines

    markdown_body = convert_qstags_to_markdown("\n".join(body_content_lines))

    frontmatter = [
        f"title: {_yaml_double_quoted(metadata['title'])}",
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
    ]

    # Pages go into the root of the output directory (e.g. content/).
    output_file_path = Path(output_dir_base) / f"{sanitize_filename(metadata['title'])}.md"
    _write_markdown(output_file_path, frontmatter, markdown_body)


def main():
    """Run the migration from the command line.

    Returns:
        0 when every file was migrated, 1 when the source directory is
        missing or at least one file could not be migrated.
    """
    parser = argparse.ArgumentParser(
        description="Migrate qsgen2 (.blog, .qst) files to qsgen3 Markdown format."
    )
    parser.add_argument(
        "--source-dir",
        required=True,
        help="Directory containing old .blog and .qst files.",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory to save new Markdown files (e.g., your qsgen3 'content' directory).",
    )
    args = parser.parse_args()

    source_path = Path(args.source_dir)
    output_path = Path(args.output_dir)

    if not source_path.is_dir():
        print(f"Error: Source directory '{source_path}' not found or not a directory.")
        return 1

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Source directory: {source_path.resolve()}")
    print(f"Output directory: {output_path.resolve()}")

    failures = 0
    # Sorted so runs are reproducible; rglob also covers subdirectories.
    for item in sorted(source_path.rglob("*")):
        if not item.is_file():
            continue
        if item.name.endswith(".blog"):
            handler = process_blog_file
        elif item.name.endswith(".qst"):
            handler = process_qst_file
        else:
            continue
        try:
            handler(item, output_path)
        except (OSError, UnicodeDecodeError) as err:
            # One unreadable file must not abort the rest of the migration.
            failures += 1
            print(f" [ERROR] Failed to migrate {item}: {err}")

    print("\nMigration complete.")
    if failures:
        print(f"{failures} file(s) could not be migrated.")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())