feat(migration): Add #showimg, #linkimg, #ytvideo conversion

Adds logic to migrate_qs2_to_qs3.py to convert:
- #showimg to Markdown image syntax ![alt](src).
- #linkimg to Markdown linked image syntax [![alt](src)](src).
- #ytvideo to HTML iframe embed for YouTube videos.

Image path processing leaves absolute URLs and root-relative paths
untouched and prepends /images/ to bare relative paths, matching the
previous qsgen2 behavior while omitting its hardcoded styling
attributes, which have no place in Markdown output.
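
Example conversions (illustrative input; a bare relative image path shown):

  #showimg cat.jpg¤A cat¤   ->  ![A cat](/images/cat.jpg)
  #linkimg cat.jpg¤A cat¤   ->  [![A cat](/images/cat.jpg)](/images/cat.jpg)
  #ytvideo dQw4w9WgXcQ      ->  <iframe ... src="https://www.youtube.com/embed/dQw4w9WgXcQ" ...></iframe>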
Author: Stig-Ørjan Smelror
Date:   2025-05-31 01:13:55 +02:00
Commit: 1283eb30cb (parent f5d6d0eb49)


migrate_qs2_to_qs3.py
@@ -0,0 +1,288 @@
import re
import argparse
from pathlib import Path
# Tags kept as raw HTML because they have no direct Markdown equivalent
HTML_PASSTHROUGH_TAGS = {
"#DV": "<div>", "#EDV": "</div>",
"#SPN": "<span>", "#ESPN": "</span>",
"#TBL": "<table>", "#ETBL": "</table>",
"#TR": "<tr>", "#ETR": "</tr>",
"#TD": "<td>", "#ETD": "</td>",
"#TH": "<th>", "#ETH": "</th>",
"#ART": "<article>", "#EART": "</article>",
"#SEC": "<section>", "#ESEC": "</section>",
"#ASIDE": "<aside>", "#EASIDE": "</aside>",
"#NAV": "<nav>", "#ENAV": "</nav>",
"#BTN": "<button>", "#EBTN": "</button>",
"#SEL": "<select>", "#ESEL": "</select>",
"#OPT": "<option>", "#EOPT": "</option>",
}
def sanitize_filename(name):
"""Sanitizes a string to be a valid filename."""
name = name.lower()
name = re.sub(r'\s+', '-', name) # Replace spaces with hyphens
name = re.sub(r'[^a-z0-9\-_.]', '', name) # Remove unwanted characters
name = re.sub(r'-+', '-', name) # Replace multiple hyphens with single
name = name.strip('-_')
return name if name else "untitled"
def convert_qstags_to_markdown(content):
"""Converts qstags in content to Markdown syntax."""
# Start with a copy to modify
md_content = content
# Links: #link URL¤TEXT¤ -> [TEXT](URL)
md_content = re.sub(r'#link\s+([^¤]+)¤([^¤]+)¤', r'[\2](\1)', md_content)
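# e.g. "#link https://example.com¤Example¤" -> "[Example](https://example.com)"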
# Headings: #H1...#EH1 -> # ..., etc.
for i in range(6, 0, -1):
# Regex to capture content between #Hi and #EHi, case insensitive, dotall for newlines
# Makes #EHi optional if it's at the end of a section or file.
md_content = re.sub(r"#H{i}(.*?)(?:#EH{i}|$)".format(i=i),
r"{} \1".format("#"*i),
md_content, flags=re.IGNORECASE | re.DOTALL)
# Clean up potential multiple newlines left by DOTALL capture if content was multi-line
md_content = re.sub(r"({} .*?)\n\n".format("#"*i), r"\1\n", md_content)
# Blockquotes: #Q...#EQ -> > ...
# This is a simplified approach. For multi-line blockquotes, each line needs '>' prefix.
# We'll capture the content and then process it line by line.
def replace_blockquote(match):
inner_content = match.group(1).strip()
lines = inner_content.split('\n')
return '\n'.join([f"> {line}" for line in lines]) + '\n'
md_content = re.sub(r"#Q(.*?)(?:#EQ|$)", replace_blockquote, md_content, flags=re.IGNORECASE | re.DOTALL)
# Ordered Lists: #OL ... #LI ... #ELI ... #EOL
def replace_ordered_list(match_ol):
ol_content = match_ol.group(1).strip()
list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ol_content, flags=re.IGNORECASE | re.MULTILINE)
processed_items = []
for i, item_text in enumerate(list_item_texts):
processed_items.append(f"{i + 1}. {item_text.strip()}")
return "\n".join(processed_items) + ("\n" if processed_items else "")
md_content = re.sub(r"^[ \t]*#OL[ \t]*\n(.*?)\n^[ \t]*#EOL[ \t]*$", replace_ordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
# Unordered Lists: #UL ... #LI ... #ELI ... #EUL
def replace_unordered_list(match_ul):
ul_content = match_ul.group(1).strip()
list_item_texts = re.findall(r"^[ \t]*#LI[ \t]*(.*?)[ \t]*(?:#ELI)?\s*$", ul_content, flags=re.IGNORECASE | re.MULTILINE)
processed_items = []
for item_text in list_item_texts:
processed_items.append(f"- {item_text.strip()}")
return "\n".join(processed_items) + ("\n" if processed_items else "")
md_content = re.sub(r"^[ \t]*#UL[ \t]*\n(.*?)\n^[ \t]*#EUL[ \t]*$", replace_unordered_list, md_content, flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
# Remove any stray #ELI tags if they weren't consumed by the #LI regex (unlikely but for cleanup)
md_content = re.sub(r"^[ \t]*#ELI[ \t]*$", "", md_content, flags=re.IGNORECASE | re.MULTILINE)
# Paragraphs: Remove #P, replace #EP with a newline to help separate blocks.
# Markdown relies on blank lines between paragraphs.
md_content = re.sub(r"#P\s*", "", md_content, flags=re.IGNORECASE)
md_content = re.sub(r"\s*#EP", "\n", md_content, flags=re.IGNORECASE)
# Inline elements
md_content = re.sub(r"#BD(.*?)#EBD", r"**\1**", md_content, flags=re.IGNORECASE)
md_content = re.sub(r"#STRONG(.*?)#ESTRONG", r"**\1**", md_content, flags=re.IGNORECASE)
md_content = re.sub(r"#I(.*?)#EI", r"*\1*", md_content, flags=re.IGNORECASE)
md_content = re.sub(r"#EM(.*?)#SEM", r"*\1*", md_content, flags=re.IGNORECASE) # Assuming #SEM is end tag for emphasis
md_content = re.sub(r"#C(.*?)#EC", r"`\1`", md_content, flags=re.IGNORECASE)
md_content = re.sub(r"#UD(.*?)#EUD", r"\1", md_content, flags=re.IGNORECASE) # Markdown has no underline, strip tags
# Images: #showimg IMAGE_PATH¤ALT_TEXT¤ -> ![ALT_TEXT](PROCESSED_IMAGE_PATH)
def process_image_path_for_markdown(raw_path):
if raw_path.startswith(('http://', 'https://', '/')):
return raw_path
else:
return f"/images/{raw_path}"
def replace_showimg_to_markdown(match):
raw_path = match.group(1)
alt_text = match.group(2)
processed_path = process_image_path_for_markdown(raw_path)
return f"![{alt_text}]({processed_path})"
md_content = re.sub(r'#showimg\s+([^¤]+)¤([^¤]+)¤', replace_showimg_to_markdown, md_content)
# Linked Images: #linkimg IMAGE_PATH¤ALT_TEXT¤ -> [![ALT_TEXT](PROCESSED_IMAGE_PATH)](PROCESSED_IMAGE_PATH)
def replace_linkimg_to_markdown(match):
raw_path = match.group(1)
alt_text = match.group(2)
processed_path = process_image_path_for_markdown(raw_path) # Reusing the same path processor
return f"[![{alt_text}]({processed_path})]({processed_path})"
md_content = re.sub(r'#linkimg\s+([^¤]+)¤([^¤]+)¤', replace_linkimg_to_markdown, md_content)
# YouTube Videos: #ytvideo YOUTUBE_ID -> HTML iframe
def replace_ytvideo_to_html(match):
video_id = match.group(1)
return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>'
md_content = re.sub(r'#ytvideo\s+([A-Za-z0-9_\-]+)', replace_ytvideo_to_html, md_content)
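# e.g. "#ytvideo dQw4w9WgXcQ" -> <iframe ... src="https://www.youtube.com/embed/dQw4w9WgXcQ" ...></iframe>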
# Line break: #BR -> two trailing spaces + newline (Markdown hard line break)
md_content = md_content.replace("#BR", "  \n")
# HTML Entities (these are fine as is, Markdown supports them)
md_content = md_content.replace("#LT", "&lt;")
md_content = md_content.replace("#GT", "&gt;")
md_content = md_content.replace("#NUM", "&num;")
# Passthrough HTML for tags without direct Markdown equivalents
for qstag, html_tag in HTML_PASSTHROUGH_TAGS.items():
md_content = md_content.replace(qstag, html_tag)
# Final cleanup:
# Normalize multiple blank lines to a single blank line (Markdown standard for paragraph separation)
md_content = re.sub(r"\n\s*\n", "\n\n", md_content)
# Remove leading/trailing whitespace from the whole content
md_content = md_content.strip()
return md_content
def process_blog_file(file_path, output_dir_base):
"""Processes a .blog file and creates a new Markdown file."""
print(f"Processing blog file: {file_path}")
content_lines = file_path.read_text(encoding="utf-8").splitlines() # qstags use the non-ASCII '¤' delimiter, so read as UTF-8 explicitly
metadata = {
"title": "Untitled Post",
"date": "",
"layout": "post",
"author": "Anonymous"
}
body_content = []
# Extract date from filename (e.g., 20250530-3.blog)
match_date_filename = re.match(r'(\d{8})-\d+\.blog', file_path.name)
if match_date_filename:
date_str = match_date_filename.group(1)
metadata['date'] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
else:
print(f" [WARN] Could not parse date from filename: {file_path.name}. Skipping date.")
parsing_ingress = False
parsing_body = False
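# Expected .blog layout (as consumed by the loop below), each marker on its own line:
#   BLOG_TITLE My First Post
#   DATE ...            (ignored; the date comes from the filename)
#   #INGRESS_START
#   (intro lines, kept as part of the body)
#   #INGRESS_STOP
#   #BODY_START
#   (main content lines)
#   #BODY_STOP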
for line in content_lines:
if line.startswith("DATE "):
# DATE field in file is secondary to filename date for posts
pass
elif line.startswith("BLOG_TITLE "):
metadata['title'] = line.replace("BLOG_TITLE ", "", 1).strip()
elif line.strip() == "#INGRESS_START":
parsing_ingress = True
continue
elif line.strip() == "#INGRESS_STOP":
parsing_ingress = False
continue
elif line.strip() == "#BODY_START":
parsing_body = True
continue
elif line.strip() == "#BODY_STOP":
parsing_body = False
continue
if parsing_ingress or parsing_body:
body_content.append(line)
markdown_body = convert_qstags_to_markdown("\n".join(body_content))
escaped_title = metadata['title'].replace('"', '\\"') # Escape for YAML
frontmatter = [
"---",
f'title: "{escaped_title}"', # Use single quotes for f-string, double for YAML value
f"date: {metadata['date']}",
f"layout: {metadata['layout']}",
f"author: {metadata['author']}",
"---",
""
]
output_content = "\n".join(frontmatter) + markdown_body
sanitized_title = sanitize_filename(metadata['title'])
if not metadata['date']:
# Fallback if date couldn't be parsed, though unlikely for .blog files
output_subdir = Path(output_dir_base) / "blog" / "unknown_date"
else:
year, month, day = metadata['date'].split('-')
output_subdir = Path(output_dir_base) / "blog" / year / month / day
output_subdir.mkdir(parents=True, exist_ok=True)
output_file_path = output_subdir / f"{sanitized_title}.md"
output_file_path.write_text(output_content, encoding="utf-8")
print(f" -> Created: {output_file_path}")
def process_qst_file(file_path, output_dir_base):
"""Processes a .qst file and creates a new Markdown file."""
print(f"Processing page file: {file_path}")
content_lines = file_path.read_text(encoding="utf-8").splitlines()
metadata = {
"title": "Untitled Page",
"layout": "page",
"author": "Anonymous" # Added for consistency
}
body_content_lines = []
if content_lines and content_lines[0].startswith("#title="):
metadata['title'] = content_lines[0].replace("#title=", "", 1).strip()
body_content_lines = content_lines[1:]
else:
print(f" [WARN] No #title= found in {file_path.name}. Using filename as title.")
metadata['title'] = file_path.stem
body_content_lines = content_lines
markdown_body = convert_qstags_to_markdown("\n".join(body_content_lines))
escaped_title = metadata['title'].replace('"', '\\"') # Escape for YAML
frontmatter = [
"---",
f'title: "{escaped_title}"', # Use single quotes for f-string, double for YAML value
f"layout: {metadata['layout']}",
f"author: {metadata['author']}",
"---",
""
]
output_content = "\n".join(frontmatter) + markdown_body
sanitized_title = sanitize_filename(metadata['title'])
# Pages go into the root of the output_dir_base (e.g. content/)
output_file_path = Path(output_dir_base) / f"{sanitized_title}.md"
output_file_path.write_text(output_content, encoding="utf-8")
print(f" -> Created: {output_file_path}")
def main():
parser = argparse.ArgumentParser(description="Migrate qsgen2 (.blog, .qst) files to qsgen3 Markdown format.")
parser.add_argument("--source-dir", required=True, help="Directory containing old .blog and .qst files.")
parser.add_argument("--output-dir", required=True, help="Directory to save new Markdown files (e.g., your qsgen3 'content' directory).")
args = parser.parse_args()
source_path = Path(args.source_dir)
output_path = Path(args.output_dir)
if not source_path.is_dir():
print(f"Error: Source directory '{source_path}' not found or not a directory.")
return
output_path.mkdir(parents=True, exist_ok=True)
print(f"Source directory: {source_path.resolve()}")
print(f"Output directory: {output_path.resolve()}")
for item in source_path.rglob('*'): # rglob to find in subdirectories too, if any
if item.is_file():
if item.name.endswith(".blog"):
process_blog_file(item, output_path)
elif item.name.endswith(".qst"):
process_qst_file(item, output_path)
print("\nMigration complete.")
if __name__ == "__main__":
main()