feat(qsgen3): Enhance output naming, URL structure, and sitemap

Implements several improvements to the qsgen3 site generator:

- **Filename Sanitization**: Introduces robust sanitization for output HTML filenames, converting to lowercase, replacing special characters with hyphens, and ensuring clean names.
- **Blog Post URL Structure**: Blog posts from content/posts/ are now generated under a blog/YYYY/MM/DD/ URL structure when the frontmatter contains a valid YYYY-MM-DD date. If the date is missing or invalid, the post falls back to blog/<sanitized-filename>.html.
- **Improved Title Fallback**: Uses the original filename (before sanitization) as a fallback title if not specified in frontmatter.
- **Enhanced Pandoc Error Handling**: Better logging for Pandoc execution and explicit check of its exit code, allowing the script to continue on single file errors.
- **Sitemap Integration**: Adds successfully generated pages to sitemap URL list.
- **Non-fatal Sitemap Generation**: Sitemap generation failures are now logged as warnings and do not halt the script.

Also, unignores the /scripts/ directory in .gitignore to include migration scripts in the repository.
This commit is contained in:
Stig-Ørjan Smelror 2025-05-31 01:15:18 +02:00
parent 1283eb30cb
commit 4b95426256
2 changed files with 160 additions and 20 deletions

1
.gitignore vendored
View File

@ -22,7 +22,6 @@ include/qsgen2/lang/*.en
output/
# Scripts directory (temporary/conversion scripts)
/scripts/
/tools/
# Build output

View File

@ -17,6 +17,7 @@ umask 0022
# --- Configuration ---
# Associative array to hold configuration values
typeset -A QSG_CONFIG
# Output paths relative to the output directory (e.g. "index.html",
# "blog/2025/05/31/post.html"). _process_markdown_files appends one entry per
# page that Pandoc rendered successfully; _generate_sitemap reads the list.
typeset -a SITEMAP_URLS=() # Array to store URLs for sitemap
# --- Script Paths ---
@ -282,6 +283,64 @@ _check_dependencies() {
_log INFO "All critical dependencies found."
}
# --- Sitemap Generation ---
#######################################
# Write sitemap.xml into the output directory from the collected page URLs.
# Globals:
#   QSG_CONFIG   (read) keys: build_options_generate_sitemap, site_url,
#                paths_output_dir
#   SITEMAP_URLS (read) page paths relative to the site root
# Outputs:
#   Creates/overwrites "${QSG_CONFIG[paths_output_dir]}/sitemap.xml";
#   progress and problems are reported through _log.
# Returns:
#   0 if the sitemap was written, or generation was skipped because it is
#     disabled or no URLs were collected
#   1 if 'site_url' is not configured or the file could not be written
#######################################
_generate_sitemap() {
  _log DEBUG "Entered _generate_sitemap"

  if [[ "${QSG_CONFIG[build_options_generate_sitemap]}" != "true" ]]; then
    _log INFO "Sitemap generation is disabled in configuration. Skipping."
    return 0
  fi

  if [[ -z "${QSG_CONFIG[site_url]}" ]]; then
    _log WARNING "'site_url' is not set in configuration. Cannot generate sitemap."
    return 1 # Indicate an issue
  fi

  if [[ ${#SITEMAP_URLS[@]} -eq 0 ]]; then
    _log INFO "No URLs collected for sitemap. Skipping sitemap generation."
    return 0
  fi

  _log INFO "Generating sitemap..."

  local sitemap_file="${QSG_CONFIG[paths_output_dir]}/sitemap.xml"
  local site_url="${QSG_CONFIG[site_url]}"
  # Ensure site_url does not end with a slash for clean concatenation
  site_url="${site_url%/}"

  local last_mod_date
  last_mod_date=$(date +%Y-%m-%d) # Current date as last modified

  local rel_url loc write_status
  # A brace group (not a pipeline) so one redirection covers the whole document
  # and the group's exit status reflects whether the file could be written.
  {
    printf '%s\n' '<?xml version="1.0" encoding="UTF-8"?>'
    printf '%s\n' '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    for rel_url in "${SITEMAP_URLS[@]}"; do
      # Drop one leading slash so joining with site_url never yields "//".
      loc="${site_url}/${rel_url#/}"
      # The sitemap protocol requires entity-escaped values. sed is used
      # instead of ${var//&/...} because the meaning of '&' in a substitution
      # replacement differs between shells and shell versions.
      # '&' must be escaped first so already-produced entities stay intact.
      loc=$(printf '%s' "$loc" | sed \
        -e 's/&/\&amp;/g' \
        -e 's/</\&lt;/g' \
        -e 's/>/\&gt;/g' \
        -e 's/"/\&quot;/g' \
        -e "s/'/\\&apos;/g")
      printf '%s\n' '  <url>'
      printf '%s\n' "    <loc>${loc}</loc>"
      printf '%s\n' "    <lastmod>${last_mod_date}</lastmod>"
      # Optional: <changefreq> and <priority>
      # printf '%s\n' "    <changefreq>weekly</changefreq>"
      # printf '%s\n' "    <priority>0.8</priority>"
      printf '%s\n' '  </url>'
    done
    printf '%s\n' '</urlset>'
  } > "$sitemap_file"
  write_status=$?

  if [[ $write_status -ne 0 ]]; then
    _log ERROR "Failed to write sitemap to $sitemap_file"
    return 1 # Indicate an issue
  fi

  _log SUCCESS "Sitemap generated successfully: $sitemap_file"
  return 0
}
# --- Core Functions ---
_clean_output_dir() {
_log INFO "Cleaning output directory: ${QSG_CONFIG[paths_output_dir]}"
@ -632,25 +691,54 @@ _process_markdown_files() {
if [[ -z "$source_file" ]]; then continue; fi
_log DEBUG "Processing Markdown file: $source_file"
local relative_path="${source_file#$content_dir/}"
local relative_path_from_content_root="${source_file#$content_dir/}"
if [[ "$content_dir" == "$source_file" ]]; then
relative_path=$(basename "$source_file")
relative_path_from_content_root=$(basename "$source_file")
elif [[ "$content_dir" == "/" && "$source_file" == /* ]]; then
relative_path="${source_file#/}"
relative_path_from_content_root="${source_file#/}"
elif [[ "$content_dir" == "." && "$source_file" == ./* ]]; then
relative_path="${source_file#./}"
relative_path_from_content_root="${source_file#./}"
fi
local output_file_html_part="${relative_path%.md}.html"
local output_file_abs="$output_dir/$output_file_html_part"
local dir_part=""
local filename_md_part=""
mkdir -p "$(dirname "$output_file_abs")"
if [[ "$relative_path_from_content_root" == */* ]]; then
dir_part="${relative_path_from_content_root%/*}/"
filename_md_part="${relative_path_from_content_root##*/}"
else
dir_part=""
filename_md_part="$relative_path_from_content_root"
fi
local filename_base_orig="${filename_md_part%.md}"
# Sanitize filename_base_orig
local sanitized_filename_base="${filename_base_orig:l}" # Lowercase
sanitized_filename_base=${sanitized_filename_base//[[:space:],;\']/'-'} # Replace spaces, commas, semicolons, apostrophes
# Consolidate multiple hyphens into one
sanitized_filename_base=$(echo "$sanitized_filename_base" | tr -s '-')
# Remove leading hyphens
while [[ "$sanitized_filename_base" == -* ]]; do
sanitized_filename_base="${sanitized_filename_base#-}"
done
# Remove trailing hyphens
while [[ "$sanitized_filename_base" == *- ]]; do
sanitized_filename_base="${sanitized_filename_base%-}"
done
if [[ -z "$sanitized_filename_base" ]]; then
_log WARNING "Original filename base '$filename_base_orig' from '$source_file' resulted in empty sanitized name. Using 'untitled' as fallback."
sanitized_filename_base="untitled"
fi
local frontmatter=$(sed -n '/^---$/,/^---$/{/^---$/d;p;}' "$source_file")
local title=$(echo "$frontmatter" | grep -m1 -iE '^title:' | sed -E 's/^title:[[:space:]]*//i; s/^["\x27](.*)["\x27]$/\1/')
local date=$(echo "$frontmatter" | grep -m1 -iE '^date:' | sed -E 's/^date:[[:space:]]*//i; s/^["\x27](.*)["\x27]$/\1/')
local title=$(echo "$frontmatter" | grep -m1 -iE '^title:' | sed -E 's/^title:[[:space:]]*//i; s/^[\"\x27](.*)[\"\x27]$/\1/')
local date=$(echo "$frontmatter" | grep -m1 -iE '^date:' | sed -E 's/^date:[[:space:]]*//i; s/^[\"\x27](.*)[\"\x27]$/\1/')
local draft=$(echo "$frontmatter" | grep -m1 -iE '^draft:' | sed -E 's/^draft:[[:space:]]*//i' | tr '[:upper:]' '[:lower:]')
local custom_layout_fm=$(echo "$frontmatter" | grep -m1 -iE '^layout:' | sed -E 's/^layout:[[:space:]]*//i; s/^["\x27](.*)["\x27]$/\1/')
local custom_layout_fm=$(echo "$frontmatter" | grep -m1 -iE '^layout:' | sed -E 's/^layout:[[:space:]]*//i; s/^[\"\x27](.*)[\"\x27]$/\1/')
if [[ "$draft" == "true" && "${QSG_CONFIG[build_options_process_drafts]}" != "true" ]]; then
_log DEBUG "Skipping draft file: $source_file"
@ -658,11 +746,49 @@ _process_markdown_files() {
fi
if [[ -z "$title" ]]; then
_log WARNING "Markdown file '$source_file' is missing a title. Using filename as fallback."
local fn_no_ext=$(basename "$source_file")
title="${fn_no_ext%.md}"
_log WARNING "Markdown file '$source_file' is missing a title. Using filename_base_orig ('$filename_base_orig') as fallback."
title="$filename_base_orig"
fi
# Determine output URL directory structure
local output_url_dir_part=""
# The variable 'relative_path_from_content_root' holds the path relative to 'content_dir' (e.g., posts/my-post.md)
# The variable 'date' is extracted from frontmatter earlier
# The variable 'dir_part' holds the original source directory (e.g., "posts/" or "pages/")
if [[ "$relative_path_from_content_root" == posts/* ]]; then # Check if it's a blog post from content/posts/
if [[ -n "$date" ]]; then # Date was extracted earlier from frontmatter
# Assuming date is YYYY-MM-DD
local parsed_year=$(echo "$date" | awk -F'-' '{print $1}')
local parsed_month=$(echo "$date" | awk -F'-' '{print $2}')
local parsed_day=$(echo "$date" | awk -F'-' '{print $3}')
local valid_date_parts=true
if ! [[ "$parsed_year" =~ ^[0-9]{4}$ ]]; then valid_date_parts=false; fi
if ! [[ "$parsed_month" =~ ^(0[1-9]|1[0-2])$ ]]; then valid_date_parts=false; fi # Validates MM is 01-12
if ! [[ "$parsed_day" =~ ^(0[1-9]|[12][0-9]|3[01])$ ]]; then valid_date_parts=false; fi # Validates DD is 01-31
if [[ "$valid_date_parts" == true ]]; then
output_url_dir_part="blog/$parsed_year/$parsed_month/$parsed_day/"
else
_log WARNING "Blog post '$source_file' (date: '$date') has invalid date components. Required: YYYY-MM-DD. URL will be 'blog/${sanitized_filename_base}.html'."
output_url_dir_part="blog/"
fi
else
_log WARNING "Blog post '$source_file' is missing a date. URL will be 'blog/${sanitized_filename_base}.html'."
output_url_dir_part="blog/"
fi
else # It's a regular page, not a blog post from content/posts/
output_url_dir_part="$dir_part" # Use its original directory structure (e.g., "pages/" or "" for root files)
fi
local output_file_html_part="${output_url_dir_part}${sanitized_filename_base}.html"
local output_file_abs="$output_dir/$output_file_html_part"
_log DEBUG "Source: $source_file -> Output URL Path: $output_file_html_part (Original base: '$filename_base_orig', Sanitized: '$sanitized_filename_base')"
mkdir -p "$(dirname "$output_file_abs")"
local template_to_use=""
if [[ -n "$custom_layout_fm" ]]; then
if [[ "$custom_layout_fm" != *.html && "$custom_layout_fm" != *.xml ]]; then custom_layout_fm+=".html"; fi
@ -674,7 +800,7 @@ _process_markdown_files() {
fi
if [[ -z "$template_to_use" ]]; then
if [[ "$relative_path" == posts/* && -f "$default_post_template" ]]; then
if [[ "$relative_path_from_content_root" == posts/* && -f "$default_post_template" ]]; then
template_to_use="$default_post_template"
elif [[ -f "$default_page_template" ]]; then
template_to_use="$default_page_template"
@ -718,13 +844,25 @@ _process_markdown_files() {
else
_log DEBUG "No CSS specified for post $source_file."
fi
_log INFO "Generating $output_file_abs from $source_file using template $template_to_use"
if ! "${pandoc_cmd[@]}"; then
_log ERROR "Pandoc failed for $source_file. Command was: ${pandoc_cmd[*]}"
_log DEBUG "Executing Pandoc for $source_file. Command constructed with individual arguments (see array definition)."
"${pandoc_cmd[@]}"
local pandoc_exit_code=$?
if [[ $pandoc_exit_code -eq 0 ]]; then
_log DEBUG "Successfully processed $source_file to $output_file_abs"
# Add to sitemap URLs
if [[ "${QSG_CONFIG[build_options_generate_sitemap]}" == "true" ]]; then
SITEMAP_URLS+=("$output_file_html_part")
_log DEBUG "Added to sitemap URLs: $output_file_html_part"
fi
else
_log DEBUG "Successfully generated $output_file_abs"
_log ERROR "Pandoc failed for $source_file (exit code: $pandoc_exit_code). Output: $output_file_abs"
# Optionally, decide if one failure should stop all, or just skip this file
# For now, we continue processing other files
continue
fi
done < <(find "$content_dir" -type f -name '*.md' -print0 | xargs -0 -r realpath)
_log INFO "Finished processing Markdown files."
@ -1094,6 +1232,9 @@ main() {
_generate_rss_feed
if [[ $? -ne 0 ]]; then _log ERROR "RSS feed generation failed."; exit 1; fi
_generate_sitemap
if [[ $? -ne 0 ]]; then _log WARNING "Sitemap generation encountered issues."; fi # Non-fatal, just a warning
_log INFO "Final state of output directory (${QSG_CONFIG[paths_output_dir]}):
$(ls -R "${QSG_CONFIG[paths_output_dir]}" 2>&1)"
_log SUCCESS "Site generation complete! Output: ${QSG_CONFIG[paths_output_dir]}"