brainsteam.co.uk/bstools/wp2hugo.py
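
# wp2hugo.py converts a WordPress WXR export (XML) into Hugo-style Markdown
# files with YAML frontmatter, downloading any referenced images into a local
# /media directory and rewriting image URLs to point at them.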

from typing import List
import click
import xml.etree.ElementTree as ET
from html import unescape
from datetime import datetime
import os
import re
import yaml
import requests
from urllib.parse import urlparse
import uuid
import phpserialize

def generate_unique_filename(original_filename):
    name, ext = os.path.splitext(original_filename)
    return f"{name}_{uuid.uuid4().hex[:8]}{ext}"

def download_image(url, output_dir):
    response = requests.get(url)
    if response.status_code == 200:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        return filename
    return None

def sanitize_filename(filename):
    return re.sub(r'[^\w\-_\. ]', '_', filename)
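
# Map IndieBlocks custom post types onto the Hugo section names used below;
# only the types listed in WHITELIST_TYPES are exported.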
TYPE_MAP = {
    'indieblocks_note': 'note',
    'indieblocks_like': 'like',
}
WHITELIST_TYPES = ['post'] # , 'note', 'like']

def process_mf2_photo(postmeta, namespaces):
    mf2_photo = None
    meta_key = postmeta.find('wp:meta_key', namespaces).text
    if meta_key == 'mf2_photo':
        meta_value = postmeta.find('wp:meta_value', namespaces).text
        try:
            # Remove CDATA wrapper if present
            if meta_value.startswith('<![CDATA[') and meta_value.endswith(']]>'):
                meta_value = meta_value[9:-3]
            # Decode the serialized PHP data
            decoded_value = phpserialize.loads(meta_value.encode('utf-8'))
            print(decoded_value)
            # Convert bytes to strings if necessary
            if isinstance(decoded_value, dict):
                mf2_photo = {k.decode('utf-8') if isinstance(k, bytes) else k:
                             v.decode('utf-8') if isinstance(v, bytes) else v
                             for k, v in decoded_value.items()}
                if len(mf2_photo) > 0:
                    mf2_photo = mf2_photo.values()
            elif isinstance(decoded_value, list):
                mf2_photo = [v.decode('utf-8') if isinstance(v, bytes) else v
                             for v in decoded_value]
            else:
                mf2_photo = decoded_value
        except Exception as e:
            print(f"Warning: Unable to parse mf2_photo: {str(e)}")
    return mf2_photo
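
# Note: mf2_photo post meta appears to be written by the WordPress
# Micropub/IndieWeb plugins as a PHP-serialized array of image URLs, which is
# why phpserialize is needed to decode it before the URLs can be downloaded.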

def process_mf2_download(mf2_photos, output_dir) -> List[str]:
    photos = []
    for mf2_photo in mf2_photos:
        if not mf2_photo:
            continue
        media_dir = os.path.join(output_dir, 'media')
        os.makedirs(media_dir, exist_ok=True)
        try:
            original_filename = download_image(mf2_photo, media_dir)
            if original_filename:
                new_filename = generate_unique_filename(original_filename)
                old_path = os.path.join(media_dir, original_filename)
                new_path = os.path.join(media_dir, new_filename)
                os.rename(old_path, new_path)
                new_url = f'/media/{new_filename}'
                print(f"Downloaded and updated mf2_photo: {new_filename}")
                photos.append(new_url)
        except Exception as e:
            print(f"Error downloading mf2_photo {mf2_photo}: {str(e)}")
    return photos  # URLs that fail to download are skipped rather than returned

@click.command()
@click.option('--input', '-i', required=True, help='Input WordPress XML file')
@click.option('--output-dir', '-o', default='output', help='Output directory for Markdown files')
@click.option('--date-cutoff', '-d', type=click.DateTime(formats=["%Y-%m-%d"]),
              help='Date cutoff in YYYY-MM-DD format. Articles before this date will be skipped.')
def parse_wordpress_xml(input, output_dir, date_cutoff):
    tree = ET.parse(input)
    root = tree.getroot()
    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'wp': 'http://wordpress.org/export/1.2/',
        'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    }
    os.makedirs(output_dir, exist_ok=True)

    for item in root.findall('.//item', namespaces):
        post_date = item.find('wp:post_date', namespaces).text
        post_datetime = datetime.strptime(post_date, "%Y-%m-%d %H:%M:%S")

        if date_cutoff and post_datetime < date_cutoff:
            continue  # Skip articles before the cutoff date

        title = item.find('title').text
        post_type = item.find('wp:post_type', namespaces).text
        post_id = item.find('wp:post_id', namespaces).text
        status = item.find('wp:status', namespaces).text

        # Extract full post URL
        post_url = item.find('link').text

        # Extract post slug from URL
        parsed_url = urlparse(post_url)
        post_slug = parsed_url.path

        # Extract mf2_photo
        mf2_photo = None
        for postmeta in item.findall('wp:postmeta', namespaces):
            mf2_photo = process_mf2_photo(postmeta, namespaces)
            if mf2_photo:
                break

        if mf2_photo:
            print(f"Post ID: {post_id}, mf2_photo: {mf2_photo}")

        if post_type in TYPE_MAP:
            post_type = TYPE_MAP[post_type]

        if post_type not in WHITELIST_TYPES:
            # print(f"Skipping {post_type} post with ID {post_id}")
            continue

        content = item.find('content:encoded', namespaces)
        if content is not None and content.text:
            content_text = unescape(content.text)

            # Create media directory if it doesn't exist
            media_dir = os.path.join(output_dir, 'media')
            os.makedirs(media_dir, exist_ok=True)

            # Find all image URLs in the content
            img_urls = re.findall(r'<img[^>]+src="([^">]+)"', content_text)

            for img_url in img_urls:
                # Download the image
                try:
                    original_filename = download_image(img_url, media_dir)
                    if original_filename:
                        # Generate a unique filename with a pseudorandom suffix
                        new_filename = generate_unique_filename(
                            original_filename)
                        old_path = os.path.join(media_dir, original_filename)
                        new_path = os.path.join(media_dir, new_filename)
                        os.rename(old_path, new_path)

                        # Update the image URL in the content
                        new_url = f'/media/{new_filename}'
                        content_text = content_text.replace(img_url, new_url)
                        print(f"Downloaded and updated image: {new_filename}")
                except Exception as e:
                    print(f"Error downloading image {img_url}: {str(e)}")

            content = content_text
        else:
            content = ''

        excerpt = item.find('excerpt:encoded', namespaces)
        excerpt = unescape(
            excerpt.text) if excerpt is not None and excerpt.text else ''

        categories = [cat.text for cat in item.findall(
            'category[@domain="category"]')]
        tags = [tag.text for tag in item.findall(
            'category[@domain="post_tag"]')]

        # Prepare frontmatter
        frontmatter = {
            'title': title,
            'date': post_date,
            'draft': status != 'publish',
            'categories': categories,
            'tags': tags,
            'type': post_type + 's',
            'url': post_slug
        }

        # Download any mf2_photo images and record them in the frontmatter
        if mf2_photo:
            mf2_urls = process_mf2_download(mf2_photo, output_dir)
            frontmatter['photo'] = [{'url': url} for url in mf2_urls]

        # Create folder structure and filename
        year = post_datetime.strftime('%Y')
        month = post_datetime.strftime('%m')
        day = post_datetime.strftime('%d')

        if title:
            post_name = sanitize_filename(title)
        else:
            # If no title, use datestamp and random suffix
            datestamp = post_datetime.strftime('%Y%m%d')
            random_suffix = uuid.uuid4().hex[:8]
            post_name = f"{datestamp}_{random_suffix}"

        # Create folder structure with post type before date
        folder_path = os.path.join(
            output_dir, post_type + 's', year, month, day)
        os.makedirs(folder_path, exist_ok=True)

        filename = f"{post_name}.md"
        filepath = os.path.join(folder_path, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            # Write YAML frontmatter
            f.write('---\n')
            yaml.dump(frontmatter, f, default_flow_style=False)
            f.write('---\n\n')

            # Write content
            if excerpt:
                f.write(f"{excerpt}\n\n<!--more-->\n\n")
            f.write(content)

        print(f"Created file: {filepath}")

if __name__ == '__main__':
    parse_wordpress_xml()
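
# Example invocation (file and directory names are illustrative only):
#   python wp2hugo.py -i wordpress-export.xml -o content -d 2020-01-01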