add wp2hugo script

parent d59575f8b2
commit 42a1da0caa

@@ -118,6 +118,16 @@ files = [
 beautifulsoup4 = ">=4.9,<5"
 six = ">=1.15,<2"
 
+[[package]]
+name = "phpserialize"
+version = "1.3"
+description = "a port of the serialize and unserialize functions of php to python."
+optional = false
+python-versions = "*"
+files = [
+    {file = "phpserialize-1.3.tar.gz", hash = "sha256:bf672d312d203d09a84c26366fab8f438a3ffb355c407e69974b7ef2d39a0fa7"},
+]
+
 [[package]]
 name = "python-dotenv"
 version = "0.19.2"

@@ -346,4 +356,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.7"
-content-hash = "1672b483488a4907061160b05790c138fb3e199ace6a2b5374a4512a76a49c2a"
+content-hash = "0385a6569e499e440a9b5f195b4e14d524bcddf01a7f5a415fa017f0aaee6cb0"

@@ -17,6 +17,7 @@ python-dotenv = "^0.19.2"
 markdownify = "^0.11.6"
 python-frontmatter = "^1.0.0"
 beautifulsoup4 = "^4.12.3"
+phpserialize = "^1.3"
 
 [tool.poetry.dev-dependencies]
 

@@ -0,0 +1,247 @@
import os
import re
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime
from html import unescape
from typing import List
from urllib.parse import urlparse

import click
import phpserialize
import requests
import yaml


def generate_unique_filename(original_filename):
    # Append a short pseudorandom suffix to avoid filename collisions
    name, ext = os.path.splitext(original_filename)
    return f"{name}_{uuid.uuid4().hex[:8]}{ext}"


def download_image(url, output_dir):
    # Fetch an image and store it in output_dir; returns the local filename,
    # or None if the request failed. The timeout keeps a stalled download
    # from hanging the export (30 s is an arbitrary choice).
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        return filename
    return None


def sanitize_filename(filename):
    return re.sub(r'[^\w\-_\. ]', '_', filename)


# Map IndieBlocks custom post types onto plain Hugo-friendly type names
TYPE_MAP = {
    'indieblocks_note': 'note',
    'indieblocks_like': 'like',
}

# Only these post types are converted; everything else is skipped
WHITELIST_TYPES = ['post', 'note', 'like']


def process_mf2_photo(postmeta, namespaces):
    # Extract photo URL(s) from an IndieBlocks 'mf2_photo' postmeta entry,
    # normalized to a list of strings (None if this entry is not mf2_photo)
    mf2_photo = None
    meta_key = postmeta.find('wp:meta_key', namespaces).text
    if meta_key == 'mf2_photo':
        meta_value = postmeta.find('wp:meta_value', namespaces).text
        try:
            # Remove CDATA wrapper if present
            if meta_value.startswith('<![CDATA[') and meta_value.endswith(']]>'):
                meta_value = meta_value[9:-3]

            # Decode the serialized PHP data
            decoded_value = phpserialize.loads(meta_value.encode('utf-8'))

            # Convert bytes to strings and normalize to a list of URLs, so
            # callers always iterate over whole URLs rather than characters
            if isinstance(decoded_value, dict):
                mf2_photo = [v.decode('utf-8') if isinstance(v, bytes) else v
                             for v in decoded_value.values()]
            elif isinstance(decoded_value, list):
                mf2_photo = [v.decode('utf-8') if isinstance(v, bytes) else v
                             for v in decoded_value]
            else:
                if isinstance(decoded_value, bytes):
                    decoded_value = decoded_value.decode('utf-8')
                mf2_photo = [decoded_value]
        except Exception as e:
            print(f"Warning: Unable to parse mf2_photo: {str(e)}")
    return mf2_photo


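# For orientation, a sketch of the data process_mf2_photo handles; the sample
# value is assumed, not taken from a real export. WordPress/IndieBlocks stores
# mf2_photo as a PHP-serialized array, e.g.
#
#     a:1:{i:0;s:29:"https://example.com/photo.jpg";}
#
# phpserialize.loads() decodes this to {0: b'https://example.com/photo.jpg'},
# and the helper above then returns ['https://example.com/photo.jpg'].

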
def process_mf2_download(mf2_photos, output_dir) -> List[str]:
    # Download each mf2 photo into <output_dir>/media and return the
    # rewritten local URLs; photos that fail to download are omitted
    photos = []

    for mf2_photo in mf2_photos:
        if not mf2_photo:
            continue

        media_dir = os.path.join(output_dir, 'media')
        os.makedirs(media_dir, exist_ok=True)

        try:
            original_filename = download_image(mf2_photo, media_dir)
            if original_filename:
                new_filename = generate_unique_filename(original_filename)
                old_path = os.path.join(media_dir, original_filename)
                new_path = os.path.join(media_dir, new_filename)
                os.rename(old_path, new_path)

                new_url = f'/media/{new_filename}'
                print(f"Downloaded and updated mf2_photo: {new_filename}")
                photos.append(new_url)
        except Exception as e:
            print(f"Error downloading mf2_photo {mf2_photo}: {str(e)}")

    return photos


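# A hedged usage sketch (URL and directory assumed, for illustration only):
# process_mf2_download(['https://example.com/a.jpg'], 'output') downloads the
# file into output/media/ and returns something like ['/media/a_1a2b3c4d.jpg'].

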
@click.command()
@click.option('--input', '-i', required=True, help='Input WordPress XML file')
@click.option('--output-dir', '-o', default='output', help='Output directory for Markdown files')
@click.option('--date-cutoff', '-d', type=click.DateTime(formats=["%Y-%m-%d"]),
              help='Date cutoff in YYYY-MM-DD format. Articles before this date will be skipped.')
def parse_wordpress_xml(input, output_dir, date_cutoff):
    tree = ET.parse(input)
    root = tree.getroot()

    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'wp': 'http://wordpress.org/export/1.2/',
        'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    }

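    # For orientation, the abridged shape of a WXR <item> this loop consumes
    # (values assumed for illustration):
    #
    #   <item>
    #     <title>Hello world</title>
    #     <wp:post_id>42</wp:post_id>
    #     <wp:post_date>2021-05-04 12:00:00</wp:post_date>
    #     <wp:post_type>indieblocks_note</wp:post_type>
    #     <wp:status>publish</wp:status>
    #     <category domain="post_tag">hugo</category>
    #     <content:encoded><![CDATA[<p>...</p>]]></content:encoded>
    #     <wp:postmeta>
    #       <wp:meta_key>mf2_photo</wp:meta_key>
    #       <wp:meta_value><![CDATA[a:1:{...}]]></wp:meta_value>
    #     </wp:postmeta>
    #   </item>
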
    os.makedirs(output_dir, exist_ok=True)
    for item in root.findall('.//item', namespaces):
        post_date = item.find('wp:post_date', namespaces).text
        post_datetime = datetime.strptime(post_date, "%Y-%m-%d %H:%M:%S")

        if date_cutoff and post_datetime < date_cutoff:
            continue  # Skip articles before the cutoff date

        title = item.find('title').text
        post_type = item.find('wp:post_type', namespaces).text
        post_id = item.find('wp:post_id', namespaces).text
        status = item.find('wp:status', namespaces).text

        # Extract mf2_photo
        mf2_photo = None
        for postmeta in item.findall('wp:postmeta', namespaces):
            mf2_photo = process_mf2_photo(postmeta, namespaces)
            if mf2_photo:
                break

        if mf2_photo:
            print(f"Post ID: {post_id}, mf2_photo: {mf2_photo}")

        if post_type in TYPE_MAP:
            post_type = TYPE_MAP[post_type]

        if post_type not in WHITELIST_TYPES:
            # print(f"Skipping {post_type} post with ID {post_id}")
            continue

        content = item.find('content:encoded', namespaces)
        if content is not None and content.text:
            content_text = unescape(content.text)

            # Create media directory if it doesn't exist
            media_dir = os.path.join(output_dir, 'media')
            os.makedirs(media_dir, exist_ok=True)

            # Find all image URLs in the content
            img_urls = re.findall(r'<img[^>]+src="([^">]+)"', content_text)

            for img_url in img_urls:
                # Download the image
                try:
                    original_filename = download_image(img_url, media_dir)
                    if original_filename:
                        # Generate a unique filename with a pseudorandom suffix
                        new_filename = generate_unique_filename(original_filename)
                        old_path = os.path.join(media_dir, original_filename)
                        new_path = os.path.join(media_dir, new_filename)
                        os.rename(old_path, new_path)

                        # Update the image URL in the content
                        new_url = f'/media/{new_filename}'
                        content_text = content_text.replace(img_url, new_url)
                        print(f"Downloaded and updated image: {new_filename}")
                except Exception as e:
                    print(f"Error downloading image {img_url}: {str(e)}")

            content = content_text
        else:
            content = ''

        excerpt = item.find('excerpt:encoded', namespaces)
        excerpt = unescape(excerpt.text) if excerpt is not None and excerpt.text else ''

        categories = [cat.text for cat in item.findall('category[@domain="category"]')]
        tags = [tag.text for tag in item.findall('category[@domain="post_tag"]')]

        # Prepare frontmatter
        frontmatter = {
            'title': title,
            'date': post_date,
            'draft': status != 'publish',
            'categories': categories,
            'tags': tags,
            'type': post_type + 's'
        }

        # Attach any downloaded mf2 photos to the frontmatter
        if mf2_photo:
            mf2_urls = process_mf2_download(mf2_photo, output_dir)
            frontmatter['photo'] = [{'url': url} for url in mf2_urls]

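        # Roughly what the frontmatter serializes to for a published note
        # (values assumed; yaml.dump sorts keys alphabetically):
        #
        #   categories: []
        #   date: '2021-05-04 12:00:00'
        #   draft: false
        #   photo:
        #   - url: /media/photo_1a2b3c4d.jpg
        #   tags:
        #   - hugo
        #   title: Hello world
        #   type: notes
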
        # Create folder structure and filename
        year = post_datetime.strftime('%Y')
        month = post_datetime.strftime('%m')
        day = post_datetime.strftime('%d')
        if title:
            post_name = sanitize_filename(title)
        else:
            # If no title, use datestamp and random suffix
            datestamp = post_datetime.strftime('%Y%m%d')
            random_suffix = uuid.uuid4().hex[:8]
            post_name = f"{datestamp}_{random_suffix}"

        # Create folder structure with post type before date,
        # e.g. output/notes/2021/05/04/
        folder_path = os.path.join(output_dir, post_type + 's', year, month, day)
        os.makedirs(folder_path, exist_ok=True)

        filename = f"{post_name}.md"
        filepath = os.path.join(folder_path, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            # Write YAML frontmatter
            f.write('---\n')
            yaml.dump(frontmatter, f, default_flow_style=False)
            f.write('---\n\n')

            # Write content
            if excerpt:
                f.write(f"{excerpt}\n\n<!--more-->\n\n")
            f.write(content)

        print(f"Created file: {filepath}")


if __name__ == '__main__':
    parse_wordpress_xml()
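
# Usage sketch (the script filename is an assumption; the commit message only
# calls this the "wp2hugo script"):
#
#     python wp2hugo.py --input export.xml --output-dir content \
#         --date-cutoff 2020-01-01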