add wp2hugo script

parent d59575f8b2
commit 42a1da0caa

@@ -118,6 +118,16 @@ files = [
 beautifulsoup4 = ">=4.9,<5"
 six = ">=1.15,<2"
 
+[[package]]
+name = "phpserialize"
+version = "1.3"
+description = "a port of the serialize and unserialize functions of php to python."
+optional = false
+python-versions = "*"
+files = [
+    {file = "phpserialize-1.3.tar.gz", hash = "sha256:bf672d312d203d09a84c26366fab8f438a3ffb355c407e69974b7ef2d39a0fa7"},
+]
+
 [[package]]
 name = "python-dotenv"
 version = "0.19.2"

@@ -346,4 +356,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.7"
-content-hash = "1672b483488a4907061160b05790c138fb3e199ace6a2b5374a4512a76a49c2a"
+content-hash = "0385a6569e499e440a9b5f195b4e14d524bcddf01a7f5a415fa017f0aaee6cb0"

@@ -17,6 +17,7 @@ python-dotenv = "^0.19.2"
 markdownify = "^0.11.6"
 python-frontmatter = "^1.0.0"
 beautifulsoup4 = "^4.12.3"
+phpserialize = "^1.3"
 
 [tool.poetry.dev-dependencies]
 

@@ -0,0 +1,247 @@
import os
import re
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime
from html import unescape
from typing import List
from urllib.parse import urlparse

import click
import phpserialize
import requests
import yaml


def generate_unique_filename(original_filename):
    # Append a short pseudorandom suffix to avoid filename collisions
    name, ext = os.path.splitext(original_filename)
    return f"{name}_{uuid.uuid4().hex[:8]}{ext}"


def download_image(url, output_dir):
    # Fetch an image and store it in output_dir; returns the local filename,
    # or None if the request failed. The timeout keeps a stalled download
    # from hanging the export (30 s is an arbitrary choice).
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        filename = os.path.basename(urlparse(url).path)
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        return filename
    return None


def sanitize_filename(filename):
    return re.sub(r'[^\w\-_\. ]', '_', filename)


# Map IndieBlocks custom post types onto plain Hugo-friendly type names
TYPE_MAP = {
    'indieblocks_note': 'note',
    'indieblocks_like': 'like',
}

# Only these post types are converted; everything else is skipped
WHITELIST_TYPES = ['post', 'note', 'like']


def process_mf2_photo(postmeta, namespaces):
    # Extract photo URL(s) from an IndieBlocks 'mf2_photo' postmeta entry,
    # normalized to a list of strings (None if this entry is not mf2_photo)
    mf2_photo = None
    meta_key = postmeta.find('wp:meta_key', namespaces).text
    if meta_key == 'mf2_photo':
        meta_value = postmeta.find('wp:meta_value', namespaces).text
        try:
            # Remove CDATA wrapper if present
            if meta_value.startswith('<![CDATA[') and meta_value.endswith(']]>'):
                meta_value = meta_value[9:-3]

            # Decode the serialized PHP data
            decoded_value = phpserialize.loads(meta_value.encode('utf-8'))

            # Convert bytes to strings and normalize to a list of URLs, so
            # callers always iterate over whole URLs rather than characters
            if isinstance(decoded_value, dict):
                mf2_photo = [v.decode('utf-8') if isinstance(v, bytes) else v
                             for v in decoded_value.values()]
            elif isinstance(decoded_value, list):
                mf2_photo = [v.decode('utf-8') if isinstance(v, bytes) else v
                             for v in decoded_value]
            else:
                if isinstance(decoded_value, bytes):
                    decoded_value = decoded_value.decode('utf-8')
                mf2_photo = [decoded_value]
        except Exception as e:
            print(f"Warning: Unable to parse mf2_photo: {str(e)}")
    return mf2_photo


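# For orientation, a sketch of the data process_mf2_photo handles; the sample
# value is assumed, not taken from a real export. WordPress/IndieBlocks stores
# mf2_photo as a PHP-serialized array, e.g.
#
#     a:1:{i:0;s:29:"https://example.com/photo.jpg";}
#
# phpserialize.loads() decodes this to {0: b'https://example.com/photo.jpg'},
# and the helper above then returns ['https://example.com/photo.jpg'].

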
def process_mf2_download(mf2_photos, output_dir) -> List[str]:
    # Download each mf2 photo into <output_dir>/media and return the
    # rewritten local URLs; photos that fail to download are omitted
    photos = []

    for mf2_photo in mf2_photos:
        if not mf2_photo:
            continue

        media_dir = os.path.join(output_dir, 'media')
        os.makedirs(media_dir, exist_ok=True)

        try:
            original_filename = download_image(mf2_photo, media_dir)
            if original_filename:
                new_filename = generate_unique_filename(original_filename)
                old_path = os.path.join(media_dir, original_filename)
                new_path = os.path.join(media_dir, new_filename)
                os.rename(old_path, new_path)

                new_url = f'/media/{new_filename}'
                print(f"Downloaded and updated mf2_photo: {new_filename}")
                photos.append(new_url)
        except Exception as e:
            print(f"Error downloading mf2_photo {mf2_photo}: {str(e)}")

    return photos


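# A hedged usage sketch (URL and directory assumed, for illustration only):
# process_mf2_download(['https://example.com/a.jpg'], 'output') downloads the
# file into output/media/ and returns something like ['/media/a_1a2b3c4d.jpg'].

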
@click.command()
@click.option('--input', '-i', required=True, help='Input WordPress XML file')
@click.option('--output-dir', '-o', default='output', help='Output directory for Markdown files')
@click.option('--date-cutoff', '-d', type=click.DateTime(formats=["%Y-%m-%d"]),
              help='Date cutoff in YYYY-MM-DD format. Articles before this date will be skipped.')
def parse_wordpress_xml(input, output_dir, date_cutoff):
    tree = ET.parse(input)
    root = tree.getroot()

    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'wp': 'http://wordpress.org/export/1.2/',
        'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    }

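    # For orientation, the abridged shape of a WXR <item> this loop consumes
    # (values assumed for illustration):
    #
    #   <item>
    #     <title>Hello world</title>
    #     <wp:post_id>42</wp:post_id>
    #     <wp:post_date>2021-05-04 12:00:00</wp:post_date>
    #     <wp:post_type>indieblocks_note</wp:post_type>
    #     <wp:status>publish</wp:status>
    #     <category domain="post_tag">hugo</category>
    #     <content:encoded><![CDATA[<p>...</p>]]></content:encoded>
    #     <wp:postmeta>
    #       <wp:meta_key>mf2_photo</wp:meta_key>
    #       <wp:meta_value><![CDATA[a:1:{...}]]></wp:meta_value>
    #     </wp:postmeta>
    #   </item>
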
    os.makedirs(output_dir, exist_ok=True)
    for item in root.findall('.//item', namespaces):
        post_date = item.find('wp:post_date', namespaces).text
        post_datetime = datetime.strptime(post_date, "%Y-%m-%d %H:%M:%S")

        if date_cutoff and post_datetime < date_cutoff:
            continue  # Skip articles before the cutoff date

        title = item.find('title').text
        post_type = item.find('wp:post_type', namespaces).text
        post_id = item.find('wp:post_id', namespaces).text
        status = item.find('wp:status', namespaces).text

        # Extract mf2_photo
        mf2_photo = None
        for postmeta in item.findall('wp:postmeta', namespaces):
            mf2_photo = process_mf2_photo(postmeta, namespaces)
            if mf2_photo:
                break

        if mf2_photo:
            print(f"Post ID: {post_id}, mf2_photo: {mf2_photo}")

        if post_type in TYPE_MAP:
            post_type = TYPE_MAP[post_type]

        if post_type not in WHITELIST_TYPES:
            # print(f"Skipping {post_type} post with ID {post_id}")
            continue

        content = item.find('content:encoded', namespaces)
        if content is not None and content.text:
            content_text = unescape(content.text)

            # Create media directory if it doesn't exist
            media_dir = os.path.join(output_dir, 'media')
            os.makedirs(media_dir, exist_ok=True)

            # Find all image URLs in the content
            img_urls = re.findall(r'<img[^>]+src="([^">]+)"', content_text)

            for img_url in img_urls:
                # Download the image
                try:
                    original_filename = download_image(img_url, media_dir)
                    if original_filename:
                        # Generate a unique filename with a pseudorandom suffix
                        new_filename = generate_unique_filename(original_filename)
                        old_path = os.path.join(media_dir, original_filename)
                        new_path = os.path.join(media_dir, new_filename)
                        os.rename(old_path, new_path)

                        # Update the image URL in the content
                        new_url = f'/media/{new_filename}'
                        content_text = content_text.replace(img_url, new_url)
                        print(f"Downloaded and updated image: {new_filename}")
                except Exception as e:
                    print(f"Error downloading image {img_url}: {str(e)}")

            content = content_text
        else:
            content = ''

        excerpt = item.find('excerpt:encoded', namespaces)
        excerpt = unescape(excerpt.text) if excerpt is not None and excerpt.text else ''

        categories = [cat.text for cat in item.findall('category[@domain="category"]')]
        tags = [tag.text for tag in item.findall('category[@domain="post_tag"]')]

        # Prepare frontmatter
        frontmatter = {
            'title': title,
            'date': post_date,
            'draft': status != 'publish',
            'categories': categories,
            'tags': tags,
            'type': post_type + 's'
        }

        # Attach any downloaded mf2 photos to the frontmatter
        if mf2_photo:
            mf2_urls = process_mf2_download(mf2_photo, output_dir)
            frontmatter['photo'] = [{'url': url} for url in mf2_urls]

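        # Roughly what the frontmatter serializes to for a published note
        # (values assumed; yaml.dump sorts keys alphabetically):
        #
        #   categories: []
        #   date: '2021-05-04 12:00:00'
        #   draft: false
        #   photo:
        #   - url: /media/photo_1a2b3c4d.jpg
        #   tags:
        #   - hugo
        #   title: Hello world
        #   type: notes
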
        # Create folder structure and filename
        year = post_datetime.strftime('%Y')
        month = post_datetime.strftime('%m')
        day = post_datetime.strftime('%d')
        if title:
            post_name = sanitize_filename(title)
        else:
            # If no title, use datestamp and random suffix
            datestamp = post_datetime.strftime('%Y%m%d')
            random_suffix = uuid.uuid4().hex[:8]
            post_name = f"{datestamp}_{random_suffix}"

        # Create folder structure with post type before date,
        # e.g. output/notes/2021/05/04/
        folder_path = os.path.join(output_dir, post_type + 's', year, month, day)
        os.makedirs(folder_path, exist_ok=True)

        filename = f"{post_name}.md"
        filepath = os.path.join(folder_path, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            # Write YAML frontmatter
            f.write('---\n')
            yaml.dump(frontmatter, f, default_flow_style=False)
            f.write('---\n\n')

            # Write content
            if excerpt:
                f.write(f"{excerpt}\n\n<!--more-->\n\n")
            f.write(content)

        print(f"Created file: {filepath}")


if __name__ == '__main__':
    parse_wordpress_xml()
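
# Usage sketch (the script filename is an assumption; the commit message only
# calls this the "wp2hugo script"):
#
#     python wp2hugo.py --input export.xml --output-dir content \
#         --date-cutoff 2020-01-01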