Adds logic to migrate_qs2_to_qs3.py to convert:

- #showimg to Markdown image syntax ![alt](src).
- #linkimg to Markdown linked image syntax [![alt](src)](src).
- #ytvideo to an HTML iframe embed for YouTube videos.

Image path processing handles absolute URLs and prepends /images/ to relative paths, matching previous qsgen2 behavior but omitting its hardcoded styling attributes, which have no place in Markdown output.
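For example (illustrative input, not taken from a real site):

    #showimg logo.png¤Site logo¤   ->  ![Site logo](/images/logo.png)
    #linkimg logo.png¤Site logo¤   ->  [![Site logo](/images/logo.png)](/images/logo.png)
    #ytvideo dQw4w9WgXcQ           ->  <iframe ... src="https://www.youtube.com/embed/dQw4w9WgXcQ" ...></iframe>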
import re
import argparse
from pathlib import Path

# Tags that are kept as HTML because they have no direct Markdown equivalents
HTML_PASSTHROUGH_TAGS = {
    "#DV": "<div>", "#EDV": "</div>",
    "#SPN": "<span>", "#ESPN": "</span>",
    "#TBL": "<table>", "#ETBL": "</table>",
    "#TR": "<tr>", "#ETR": "</tr>",
    "#TD": "<td>", "#ETD": "</td>",
    "#TH": "<th>", "#ETH": "</th>",
    "#ART": "<article>", "#EART": "</article>",
    "#SEC": "<section>", "#ESEC": "</section>",
    "#ASIDE": "<aside>", "#EASIDE": "</aside>",
    "#NAV": "<nav>", "#ENAV": "</nav>",
    "#BTN": "<button>", "#EBTN": "</button>",
    "#SEL": "<select>", "#ESEL": "</select>",
    "#OPT": "<option>", "#EOPT": "</option>",
}

def sanitize_filename(name):
    """Sanitizes a string to be a valid filename."""
    name = name.lower()
    name = re.sub(r'\s+', '-', name)  # Replace runs of whitespace with hyphens
    name = re.sub(r'[^a-z0-9\-_.]', '', name)  # Remove unwanted characters
    name = re.sub(r'-+', '-', name)  # Collapse multiple hyphens into one
    name = name.strip('-_')
    return name if name else "untitled"
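
# Illustrative examples (hypothetical titles, not from the migration corpus):
#   sanitize_filename("Hello, World! -- Draft #2")  ->  "hello-world-draft-2"
#   sanitize_filename("???")                        ->  "untitled"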


def convert_qstags_to_markdown(content):
    """Converts qstags in content to Markdown syntax."""

    # Start with a copy to modify
    md_content = content

    # Links: #link URL¤TEXT¤ -> [TEXT](URL)
    md_content = re.sub(r'#link\s+([^¤]+)¤([^¤]+)¤', r'[\2](\1)', md_content)

    # Headings: #H1...#EH1 -> # ..., etc.
    for i in range(6, 0, -1):
        # Capture content between #Hi and #EHi; case-insensitive, DOTALL for newlines.
        # #EHi is optional if the heading sits at the end of a section or file.
        md_content = re.sub(r"#H{i}(.*?)(?:#EH{i}|$)".format(i=i),
                            r"{} \1".format("#" * i),
                            md_content, flags=re.IGNORECASE | re.DOTALL)
        # Clean up potential multiple newlines left by the DOTALL capture if content was multi-line
        md_content = re.sub(r"({} .*?)\n\n".format("#" * i), r"\1\n", md_content)

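    # e.g. "#H2About us#EH2" -> "## About us" (whitespace inside the capture is kept verbatim)
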
    # Blockquotes: #Q...#EQ -> > ...
    # Simplified approach: multi-line blockquotes need a '>' prefix on each line,
    # so capture the content and process it line by line.
    def replace_blockquote(match):
        inner_content = match.group(1).strip()
        lines = inner_content.split('\n')
        return '\n'.join([f"> {line}" for line in lines]) + '\n'
    md_content = re.sub(r"#Q(.*?)(?:#EQ|$)", replace_blockquote, md_content, flags=re.IGNORECASE | re.DOTALL)

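    # e.g. "#Q Stay hungry.\nStay foolish. #EQ" -> "> Stay hungry.\n> Stay foolish.\n"
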
    # Ordered Lists: #OL ... #LI ... #ELI ... #EOL
    def replace_ordered_list(match_ol):
        ol_content = match_ol.group(1).strip()
        list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ol_content, flags=re.IGNORECASE | re.MULTILINE)
        processed_items = []
        for i, item_text in enumerate(list_item_texts):
            processed_items.append(f"{i + 1}. {item_text.strip()}")
        return "\n".join(processed_items) + ("\n" if processed_items else "")

    md_content = re.sub(r"^[ \t]*#OL[ \t]*\n(.*?)\n^[ \t]*#EOL[ \t]*$", replace_ordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

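    # e.g. "#OL\n#LI First #ELI\n#LI Second #ELI\n#EOL" -> "1. First\n2. Second\n"
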
    # Unordered Lists: #UL ... #LI ... #ELI ... #EUL
    def replace_unordered_list(match_ul):
        ul_content = match_ul.group(1).strip()
        list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ul_content, flags=re.IGNORECASE | re.MULTILINE)
        processed_items = []
        for item_text in list_item_texts:
            processed_items.append(f"- {item_text.strip()}")
        return "\n".join(processed_items) + ("\n" if processed_items else "")

    md_content = re.sub(r"^[ \t]*#UL[ \t]*\n(.*?)\n^[ \t]*#EUL[ \t]*$", replace_unordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

    # Remove any stray #ELI tags that weren't consumed by the #LI regex (unlikely, but for cleanup)
    md_content = re.sub(r"^[ \t]*#ELI[ \t]*$", "", md_content, flags=re.IGNORECASE | re.MULTILINE)

    # Paragraphs: remove #P, replace #EP with a newline to help separate blocks.
    # Markdown relies on blank lines between paragraphs.
    md_content = re.sub(r"#P\s*", "", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"\s*#EP", "\n", md_content, flags=re.IGNORECASE)

    # Inline elements
    md_content = re.sub(r"#BD(.*?)#EBD", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#STRONG(.*?)#ESTRONG", r"**\1**", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#I(.*?)#EI", r"*\1*", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#EM(.*?)#SEM", r"*\1*", md_content, flags=re.IGNORECASE)  # Assuming #SEM is the end tag for emphasis
    md_content = re.sub(r"#C(.*?)#EC", r"`\1`", md_content, flags=re.IGNORECASE)
    md_content = re.sub(r"#UD(.*?)#EUD", r"\1", md_content, flags=re.IGNORECASE)  # Markdown has no underline; strip the tags

    # Images: #showimg IMAGE_PATH¤ALT_TEXT¤ -> ![ALT_TEXT](PROCESSED_IMAGE_PATH)
    def process_image_path_for_markdown(raw_path):
        if raw_path.startswith(('http://', 'https://', '/')):
            return raw_path
        else:
            return f"/images/{raw_path}"

    def replace_showimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)
        return f"![{alt_text}]({processed_path})"
    md_content = re.sub(r'#showimg\s+([^¤]+)¤([^¤]+)¤', replace_showimg_to_markdown, md_content)

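    # Path handling mirrors qsgen2: absolute URLs and rooted paths pass through
    # untouched; bare relative paths get the /images/ prefix. For example:
    #   process_image_path_for_markdown("logo.png")           -> "/images/logo.png"
    #   process_image_path_for_markdown("https://a.b/c.png")  -> "https://a.b/c.png"
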
    # Linked Images: #linkimg IMAGE_PATH¤ALT_TEXT¤ -> [![ALT_TEXT](PROCESSED_IMAGE_PATH)](PROCESSED_IMAGE_PATH)
    def replace_linkimg_to_markdown(match):
        raw_path = match.group(1)
        alt_text = match.group(2)
        processed_path = process_image_path_for_markdown(raw_path)  # Reuse the same path processor
        return f"[![{alt_text}]({processed_path})]({processed_path})"
    md_content = re.sub(r'#linkimg\s+([^¤]+)¤([^¤]+)¤', replace_linkimg_to_markdown, md_content)

    # YouTube Videos: #ytvideo YOUTUBE_ID -> HTML iframe embed
    def replace_ytvideo_to_html(match):
        video_id = match.group(1)
        return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
    md_content = re.sub(r'#ytvideo\s+([A-Za-z0-9_\-]+)', replace_ytvideo_to_html, md_content)

    # Line break: #BR -> two trailing spaces + newline (Markdown hard break)
    md_content = md_content.replace("#BR", "  \n")

    # Literal character tags (Markdown handles these characters directly)
    md_content = md_content.replace("#LT", "<")
    md_content = md_content.replace("#GT", ">")
    md_content = md_content.replace("#NUM", "#")

    # Passthrough HTML for tags without direct Markdown equivalents
    for qstag, html_tag in HTML_PASSTHROUGH_TAGS.items():
        md_content = md_content.replace(qstag, html_tag)

    # Final cleanup:
    # Normalize runs of blank lines to a single blank line (Markdown standard for paragraph separation)
    md_content = re.sub(r"\n\s*\n", "\n\n", md_content)
    # Remove leading/trailing whitespace from the whole content
    md_content = md_content.strip()

    return md_content

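# A quick end-to-end sketch (hypothetical input):
#   convert_qstags_to_markdown("#P This is #BDbold#EBD text #EP")
#   -> "This is **bold** text"
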

def process_blog_file(file_path, output_dir_base):
    """Processes a .blog file and creates a new Markdown file."""
    print(f"Processing blog file: {file_path}")
    content_lines = file_path.read_text().splitlines()

    metadata = {
        "title": "Untitled Post",
        "date": "",
        "layout": "post",
        "author": "Anonymous"
    }
    body_content = []

    # Extract the date from the filename (e.g., 20250530-3.blog)
    match_date_filename = re.match(r'(\d{8})-\d+\.blog', file_path.name)
    if match_date_filename:
        date_str = match_date_filename.group(1)
        metadata['date'] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
    else:
        print(f" [WARN] Could not parse date from filename: {file_path.name}. Skipping date.")

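    # e.g. "20250530-3.blog" yields metadata['date'] == "2025-05-30"
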
    parsing_ingress = False
    parsing_body = False

    for line in content_lines:
        if line.startswith("DATE "):
            # The DATE field in the file is secondary to the filename date for posts
            pass
        elif line.startswith("BLOG_TITLE "):
            metadata['title'] = line.replace("BLOG_TITLE ", "", 1).strip()
        elif line.strip() == "#INGRESS_START":
            parsing_ingress = True
            continue
        elif line.strip() == "#INGRESS_STOP":
            parsing_ingress = False
            continue
        elif line.strip() == "#BODY_START":
            parsing_body = True
            continue
        elif line.strip() == "#BODY_STOP":
            parsing_body = False
            continue

        if parsing_ingress or parsing_body:
            body_content.append(line)

    markdown_body = convert_qstags_to_markdown("\n".join(body_content))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',  # Single quotes for the f-string, double quotes for the YAML value
        f"date: {metadata['date']}",
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        ""
    ]

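    # The rendered frontmatter looks like this (illustrative values):
    #   ---
    #   title: "My First Post"
    #   date: 2025-05-30
    #   layout: post
    #   author: Anonymous
    #   ---
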
    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    if not metadata['date']:
        # Fallback if the date couldn't be parsed, though unlikely for .blog files
        output_subdir = Path(output_dir_base) / "blog" / "unknown_date"
    else:
        year, month, day = metadata['date'].split('-')
        output_subdir = Path(output_dir_base) / "blog" / year / month / day

    output_subdir.mkdir(parents=True, exist_ok=True)
    output_file_path = output_subdir / f"{sanitized_title}.md"

    output_file_path.write_text(output_content)
    print(f" -> Created: {output_file_path}")


def process_qst_file(file_path, output_dir_base):
    """Processes a .qst file and creates a new Markdown file."""
    print(f"Processing page file: {file_path}")
    content_lines = file_path.read_text().splitlines()

    metadata = {
        "title": "Untitled Page",
        "layout": "page",
        "author": "Anonymous"  # Added for consistency with posts
    }
    body_content_lines = []

    if content_lines and content_lines[0].startswith("#title="):
        metadata['title'] = content_lines[0].replace("#title=", "", 1).strip()
        body_content_lines = content_lines[1:]
    else:
        print(f" [WARN] No #title= found in {file_path.name}. Using filename as title.")
        metadata['title'] = file_path.stem
        body_content_lines = content_lines

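    # e.g. a first line of "#title=About Me" gives title "About Me" and output file about-me.md
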
    markdown_body = convert_qstags_to_markdown("\n".join(body_content_lines))

    escaped_title = metadata['title'].replace('"', '\\"')  # Escape for YAML
    frontmatter = [
        "---",
        f'title: "{escaped_title}"',  # Single quotes for the f-string, double quotes for the YAML value
        f"layout: {metadata['layout']}",
        f"author: {metadata['author']}",
        "---",
        ""
    ]

    output_content = "\n".join(frontmatter) + markdown_body

    sanitized_title = sanitize_filename(metadata['title'])
    # Pages go into the root of output_dir_base (e.g. content/)
    output_file_path = Path(output_dir_base) / f"{sanitized_title}.md"

    output_file_path.write_text(output_content)
    print(f" -> Created: {output_file_path}")


def main():
    parser = argparse.ArgumentParser(description="Migrate qsgen2 (.blog, .qst) files to qsgen3 Markdown format.")
    parser.add_argument("--source-dir", required=True, help="Directory containing old .blog and .qst files.")
    parser.add_argument("--output-dir", required=True, help="Directory to save new Markdown files (e.g., your qsgen3 'content' directory).")
    args = parser.parse_args()

    source_path = Path(args.source_dir)
    output_path = Path(args.output_dir)

    if not source_path.is_dir():
        print(f"Error: Source directory '{source_path}' not found or not a directory.")
        return

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Source directory: {source_path.resolve()}")
    print(f"Output directory: {output_path.resolve()}")

    for item in source_path.rglob('*'):  # rglob finds files in subdirectories too, if any
        if item.is_file():
            if item.name.endswith(".blog"):
                process_blog_file(item, output_path)
            elif item.name.endswith(".qst"):
                process_qst_file(item, output_path)

    print("\nMigration complete.")


if __name__ == "__main__":
    main()
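
# Example invocation (paths are illustrative):
#   python migrate_qs2_to_qs3.py --source-dir old_site/ --output-dir content/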