From 42a1da0caa8ba964775e5a29658e6af9fd1343c0 Mon Sep 17 00:00:00 2001 From: James Ravenscroft Date: Sun, 8 Sep 2024 17:01:43 +0100 Subject: [PATCH] add wp2hugo script --- bstools/poetry.lock | 12 +- bstools/pyproject.toml | 1 + bstools/wp2hugo.py | 247 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 bstools/wp2hugo.py diff --git a/bstools/poetry.lock b/bstools/poetry.lock index cdc0b34..77dd53b 100644 --- a/bstools/poetry.lock +++ b/bstools/poetry.lock @@ -118,6 +118,16 @@ files = [ beautifulsoup4 = ">=4.9,<5" six = ">=1.15,<2" +[[package]] +name = "phpserialize" +version = "1.3" +description = "a port of the serialize and unserialize functions of php to python." +optional = false +python-versions = "*" +files = [ + {file = "phpserialize-1.3.tar.gz", hash = "sha256:bf672d312d203d09a84c26366fab8f438a3ffb355c407e69974b7ef2d39a0fa7"}, +] + [[package]] name = "python-dotenv" version = "0.19.2" @@ -346,4 +356,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.7" -content-hash = "1672b483488a4907061160b05790c138fb3e199ace6a2b5374a4512a76a49c2a" +content-hash = "0385a6569e499e440a9b5f195b4e14d524bcddf01a7f5a415fa017f0aaee6cb0" diff --git a/bstools/pyproject.toml b/bstools/pyproject.toml index d95c743..ff99dfb 100644 --- a/bstools/pyproject.toml +++ b/bstools/pyproject.toml @@ -17,6 +17,7 @@ python-dotenv = "^0.19.2" markdownify = "^0.11.6" python-frontmatter = "^1.0.0" beautifulsoup4 = "^4.12.3" +phpserialize = "^1.3" [tool.poetry.dev-dependencies] diff --git a/bstools/wp2hugo.py b/bstools/wp2hugo.py new file mode 100644 index 0000000..e9cc12c --- /dev/null +++ b/bstools/wp2hugo.py @@ -0,0 +1,247 @@ +from multiprocessing import process +from typing import List +import click +import xml.etree.ElementTree as ET +from html import unescape +from datetime import datetime +import os +import re +import yaml +import requests 
import os
from urllib.parse import urlparse
import uuid
import phpserialize


# Seconds to wait on a remote media server before abandoning a download, so a
# single stalled host cannot hang the whole export.
DOWNLOAD_TIMEOUT = 30

# Markers of an XML CDATA section that may still wrap a raw meta value.
CDATA_PREFIX = '<![CDATA['
CDATA_SUFFIX = ']]>'

# Captures the src attribute of every <img> tag in a post body.
IMG_SRC_PATTERN = re.compile(r'<img[^>]+src="([^">]+)"')

TYPE_MAP = {
    'indieblocks_note': 'note',
    'indieblocks_like': 'like',
}

WHITELIST_TYPES = ['post', 'note', 'like']


def generate_unique_filename(original_filename):
    """Return ``original_filename`` with a random 8-hex-digit suffix.

    The suffix is inserted between the stem and the extension, e.g.
    ``photo.jpg`` -> ``photo_1a2b3c4d.jpg``.
    """
    name, ext = os.path.splitext(original_filename)
    return f"{name}_{uuid.uuid4().hex[:8]}{ext}"


def download_image(url, output_dir):
    """Download ``url`` into ``output_dir`` under the URL's own basename.

    Returns:
        The basename of the written file, or ``None`` when the URL path has
        no filename component or the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: on network failure or timeout.
        OSError: if the file cannot be written.
    """
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # A URL ending in "/" has no basename; writing would target the
        # directory itself.
        return None

    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    if response.status_code != 200:
        return None

    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'wb') as f:
        f.write(response.content)
    return filename


def sanitize_filename(filename):
    """Replace every character except word chars, ``-``, ``_``, ``.`` and space with ``_``."""
    return re.sub(r'[^\w\-_\. ]', '_', filename)


def _decode_php_value(value):
    """Return ``value`` decoded as UTF-8 if it is ``bytes``, else unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


def _download_to_media(url, media_dir):
    """Download ``url`` into ``media_dir`` and rename it to a unique filename.

    Returns the new filename, or ``None`` if nothing was downloaded.
    Exceptions from the download or the rename propagate to the caller.
    """
    original_filename = download_image(url, media_dir)
    if not original_filename:
        return None
    new_filename = generate_unique_filename(original_filename)
    os.rename(os.path.join(media_dir, original_filename),
              os.path.join(media_dir, new_filename))
    return new_filename


def process_mf2_photo(postmeta, namespaces):
    """Extract photo URLs from a ``wp:postmeta`` element keyed ``mf2_photo``.

    The meta value is PHP-serialized data; it is unserialized and flattened
    into a list of values with ``bytes`` decoded to ``str``.

    Args:
        postmeta: a ``wp:postmeta`` element.
        namespaces: prefix -> URI mapping used to resolve the ``wp:`` prefix.

    Returns:
        A non-empty list of values, or ``None`` when the element is not an
        ``mf2_photo`` entry, is empty, or cannot be parsed.
    """
    key_el = postmeta.find('wp:meta_key', namespaces)
    if key_el is None or key_el.text != 'mf2_photo':
        return None

    value_el = postmeta.find('wp:meta_value', namespaces)
    meta_value = value_el.text if value_el is not None else None
    if not meta_value:
        return None

    try:
        # Strip a CDATA wrapper only when one is actually present; slicing
        # unconditionally would corrupt the serialized payload.
        if meta_value.startswith(CDATA_PREFIX) and meta_value.endswith(CDATA_SUFFIX):
            meta_value = meta_value[len(CDATA_PREFIX):-len(CDATA_SUFFIX)]
        decoded_value = phpserialize.loads(meta_value.encode('utf-8'))
    except Exception as e:
        # Best effort: one malformed meta entry must not abort the export.
        print(f"Warning: Unable to parse mf2_photo: {str(e)}")
        return None

    if isinstance(decoded_value, dict):
        values = list(decoded_value.values())
    elif isinstance(decoded_value, (list, tuple)):
        values = list(decoded_value)
    else:
        # A bare scalar is wrapped so callers can always iterate the result.
        values = [decoded_value]

    photos = [_decode_php_value(v) for v in values if v]
    return photos or None


def process_mf2_download(mf2_photos, output_dir) -> List[str]:
    """Download each photo URL into ``<output_dir>/media``.

    Args:
        mf2_photos: iterable of photo URLs (a single URL string is accepted
            and treated as a one-item list). Falsy entries are skipped.
        output_dir: root output directory.

    Returns:
        Site-relative ``/media/<name>`` URLs of the photos that downloaded
        successfully. Failed downloads are reported and omitted.
    """
    if isinstance(mf2_photos, (str, bytes)):
        # Iterating a bare string would yield single characters.
        mf2_photos = [_decode_php_value(mf2_photos)]

    media_dir = os.path.join(output_dir, 'media')
    os.makedirs(media_dir, exist_ok=True)

    photos = []
    for mf2_photo in mf2_photos:
        if not mf2_photo:
            continue
        try:
            new_filename = _download_to_media(mf2_photo, media_dir)
        except Exception as e:
            print(f"Error downloading mf2_photo {mf2_photo}: {str(e)}")
            continue
        if new_filename:
            print(f"Downloaded and updated mf2_photo: {new_filename}")
            photos.append(f'/media/{new_filename}')

    return photos


def _child_text(item, path, namespaces):
    """Return the text of the first child matching ``path``, or ``None``."""
    element = item.find(path, namespaces)
    return element.text if element is not None else None


def _localize_images(content_text, media_dir):
    """Download every ``<img>`` source in ``content_text`` and rewrite its URL.

    Returns the content with each successfully downloaded image URL replaced
    by its ``/media/<name>`` path; URLs that fail to download are left as-is.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so an image
    # referenced several times is downloaded only once.
    for img_url in dict.fromkeys(IMG_SRC_PATTERN.findall(content_text)):
        try:
            new_filename = _download_to_media(img_url, media_dir)
        except Exception as e:
            print(f"Error downloading image {img_url}: {str(e)}")
            continue
        if new_filename:
            content_text = content_text.replace(img_url, f'/media/{new_filename}')
            print(f"Downloaded and updated image: {new_filename}")
    return content_text


def _build_post_name(title, post_datetime):
    """Return the file stem for a post: sanitized title, or date plus random suffix."""
    if title:
        return sanitize_filename(title)
    datestamp = post_datetime.strftime('%Y%m%d')
    return f"{datestamp}_{uuid.uuid4().hex[:8]}"


@click.command()
@click.option('--input', '-i', required=True, help='Input WordPress XML file')
@click.option('--output-dir', '-o', default='output', help='Output directory for Markdown files')
@click.option('--date-cutoff', '-d', type=click.DateTime(formats=["%Y-%m-%d"]),
              help='Date cutoff in YYYY-MM-DD format. Articles before this date will be skipped.')
def parse_wordpress_xml(input, output_dir, date_cutoff):
    """Convert a WordPress XML export into Markdown files with YAML frontmatter.

    Each whitelisted item is written to
    <output-dir>/<type>s/<YYYY>/<MM>/<DD>/<name>.md, and referenced images
    are downloaded into <output-dir>/media.
    """
    tree = ET.parse(input)
    root = tree.getroot()

    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'wp': 'http://wordpress.org/export/1.2/',
        'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    }

    os.makedirs(output_dir, exist_ok=True)
    media_dir = os.path.join(output_dir, 'media')

    for item in root.findall('.//item', namespaces):
        post_id = _child_text(item, 'wp:post_id', namespaces)
        post_date = _child_text(item, 'wp:post_date', namespaces)
        try:
            post_datetime = datetime.strptime(post_date or '', "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # One item with a missing or malformed date should not abort the run.
            print(f"Warning: skipping post {post_id}: invalid post_date {post_date!r}")
            continue

        if date_cutoff and post_datetime < date_cutoff:
            continue  # Skip articles before the cutoff date

        post_type = _child_text(item, 'wp:post_type', namespaces)
        post_type = TYPE_MAP.get(post_type, post_type)
        if post_type not in WHITELIST_TYPES:
            continue

        title = _child_text(item, 'title', namespaces)
        status = _child_text(item, 'wp:status', namespaces)

        # The first postmeta entry that yields photos wins.
        mf2_photo = None
        for postmeta in item.findall('wp:postmeta', namespaces):
            mf2_photo = process_mf2_photo(postmeta, namespaces)
            if mf2_photo:
                break

        if mf2_photo:
            print(f"Post ID: {post_id}, mf2_photo: {mf2_photo}")

        content_el = item.find('content:encoded', namespaces)
        if content_el is not None and content_el.text:
            os.makedirs(media_dir, exist_ok=True)
            content = _localize_images(unescape(content_el.text), media_dir)
        else:
            content = ''

        excerpt_el = item.find('excerpt:encoded', namespaces)
        if excerpt_el is not None and excerpt_el.text:
            excerpt = unescape(excerpt_el.text)
        else:
            excerpt = ''

        categories = [cat.text for cat in item.findall(
            'category[@domain="category"]')]
        tags = [tag.text for tag in item.findall(
            'category[@domain="post_tag"]')]

        frontmatter = {
            'title': title,
            'date': post_date,
            'draft': status != 'publish',
            'categories': categories,
            'tags': tags,
            'type': post_type + 's'
        }

        if mf2_photo:
            mf2_urls = process_mf2_download(mf2_photo, output_dir)
            frontmatter['photo'] = [{'url': url} for url in mf2_urls]

        # Folder layout: <output_dir>/<type>s/<year>/<month>/<day>/
        folder_path = os.path.join(
            output_dir,
            post_type + 's',
            post_datetime.strftime('%Y'),
            post_datetime.strftime('%m'),
            post_datetime.strftime('%d'),
        )
        os.makedirs(folder_path, exist_ok=True)

        post_name = _build_post_name(title, post_datetime)
        filepath = os.path.join(folder_path, f"{post_name}.md")

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('---\n')
            # allow_unicode keeps non-ASCII titles readable instead of
            # emitting escape sequences; the file is opened as UTF-8.
            yaml.dump(frontmatter, f, default_flow_style=False,
                      allow_unicode=True)
            f.write('---\n\n')

            if excerpt:
                # Hugo's manual summary divider separates excerpt from body.
                f.write(f"{excerpt}\n\n<!--more-->\n\n")
            f.write(content)

        print(f"Created file: {filepath}")


if __name__ == '__main__':
    parse_wordpress_xml()