2022-01-02 16:37:40 +00:00
|
|
|
import click
|
|
|
|
import dotenv
|
|
|
|
import os
|
|
|
|
import requests
|
|
|
|
import ujson
|
2023-07-09 11:06:57 +01:00
|
|
|
import frontmatter
|
2022-01-02 16:37:40 +00:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
2024-09-07 09:22:56 +01:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
def get_html_title(url, timeout=10):
    """
    Fetches the HTML content from a given URL and returns its title.

    Args:
        url (str): The URL to fetch HTML content from.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The title of the fetched HTML content with surrounding
        whitespace removed, or None if the page could not be fetched or
        has no usable title.
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive
        # host, which would stall the whole run on a single dead link.
        response = requests.get(url, timeout=timeout)

        # Only a 200 response is treated as a page worth parsing.
        if response.status_code != 200:
            print(
                f"Failed to fetch HTML content. Status code: {response.status_code}")
            return None

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title is None when there is no <title> tag, and .string can be
        # None for a tag without a single text child.
        raw_title = soup.title.string if soup.title else None
        if not raw_title:
            return None

        # Convert to a plain str and trim the whitespace/newlines that pages
        # often put around the title text; a blank result counts as missing.
        title = str(raw_title).strip()
        return title or None

    except Exception as e:
        # Deliberately best-effort: a failed lookup is reported and the
        # caller simply gets no title.
        print(f"An error occurred: {e}")
        return None
|
|
|
|
|
|
|
|
|
2022-01-02 16:37:40 +00:00
|
|
|
@click.group()
def cli():
    # Load settings from a .env file (if one exists) into the process
    # environment. A comment is used instead of a docstring so the group's
    # --help text stays unchanged.
    dotenv.load_dotenv()
|
|
|
|
|
2024-09-07 09:22:56 +01:00
|
|
|
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
def fetch_link_titles(folder):
    """Fetch titles for reply and bookmark links"""
    # Front matter keys that may hold a link, either as a bare URL string or
    # as a mapping with a 'url' entry.
    link_keys = ['in-reply-to', 'bookmark-of']

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            full_path = os.path.join(dirpath, filename)
            data = frontmatter.load(full_path)

            print(f"Analysing... {full_path}")

            updated = False

            for property_name in link_keys:
                property_data = data.get(property_name)
                if not property_data:
                    continue

                # Normalise both accepted shapes to a mapping with a 'url'.
                if isinstance(property_data, str):
                    entry = {"url": property_data}
                elif isinstance(property_data, dict) and 'url' in property_data:
                    entry = property_data
                else:
                    continue

                if 'twitter.com' in entry['url']:
                    print(f"Not grabbing title for tweet in {property_name}")
                    continue

                # An entry that already carries a title is left alone.
                if 'title' in entry:
                    continue

                title = get_html_title(entry['url'])
                if title is None:
                    continue

                print(f"Found {property_name} title: '{title}'")
                entry['title'] = str(title)
                data[property_name] = entry
                updated = True

            # Rewrite the file only when at least one title was added.
            if updated:
                print(f"Updating data... {full_path}")
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)
|
2024-09-07 09:22:56 +01:00
|
|
|
|
|
|
|
|
2023-07-09 11:06:57 +01:00
|
|
|
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--old_type", type=str, required=True)
@click.option("--new_type", type=str, required=True)
def fix_post_types(folder: str, old_type: str, new_type: str):
    """Fix post type metadata"""
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            full_path = os.path.join(dirpath, filename)
            data = frontmatter.load(full_path)

            print(f"Analysing... {full_path}")

            # Files without a 'type' key cannot be retyped.
            if 'type' not in data:
                print(
                    f"Skipping {full_path} due to incomplete frontmatter")
                continue

            # Leave files of any other type untouched.
            if data['type'] != old_type:
                continue

            print(
                f"Update type for {full_path}: {old_type}->{new_type}")
            data['type'] = new_type

            # Only files whose type actually changed are written back.
            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)
|
|
|
|
|
|
|
|
|
2023-07-09 11:33:28 +01:00
|
|
|
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)
def set_page_meta(folder: str, page_meta: str):
    """Set the post_meta field list on markdown files"""
    # Split the comma separated option into field names. Whitespace around
    # each name is trimmed and empty entries (e.g. from a trailing comma)
    # are dropped, so "a, b," yields ['a', 'b'] rather than ['a', ' b', ''].
    meta = [field.strip() for field in page_meta.split(",") if field.strip()]

    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue

            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)

            print(f"Update page_meta for {full_path}: {meta}")

            # The list is stored under 'post_meta'; any existing 'page_meta'
            # key is removed from the front matter.
            if 'page_meta' in data:
                del data['page_meta']
            data['post_meta'] = meta

            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)
|
2023-07-09 11:06:57 +01:00
|
|
|
|
|
|
|
|
2022-01-02 16:37:40 +00:00
|
|
|
@cli.command()
@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)
def fetch_mentions(mentions_file: str):
    """Fetch web mentions and store as json"""
    # Existing mentions are a mapping of target URL path -> list of mention
    # objects; start empty when the file does not exist yet.
    if os.path.exists(mentions_file):
        print(f"Load existing mentions from {mentions_file}")
        with open(mentions_file, 'r', encoding='utf-8') as f:
            mentions = ujson.load(f)
        print(mentions.keys())
        print(f"Found existing mentions for {len(mentions)} urls")
    else:
        mentions = {}

    # IDs of every mention already stored, used to skip duplicates below.
    mention_ids = {
        post['id']
        for mentionset in mentions.values()
        for post in mentionset
    }

    # Fail early with a clear message instead of sending the literal string
    # "None" as the token when the variable is not configured.
    api_key = os.environ.get('WEBMENTIONSIO_API_KEY')
    if not api_key:
        print("Failed to request webmentions: WEBMENTIONSIO_API_KEY is not set")
        return 1

    print("Requesting new mentions...")
    # Passing the token via params lets requests URL-encode it; the timeout
    # stops an unresponsive server from hanging the command forever.
    r = requests.get(
        "https://webmention.io/api/mentions.json",
        params={"token": api_key},
        timeout=30,
    )

    # Parse the body once and reuse it.
    payload = r.json()

    if payload.get('error') is not None:
        print(f"Failed to request webmentions: {payload}")
        # NOTE(review): click appears to discard command return values when
        # run in standalone mode, so this may not set a non-zero exit
        # status -- confirm whether that is wanted.
        return 1

    new = 0
    for link in payload['links']:
        # Mentions are grouped by the path component of the target URL.
        target = urlparse(link['target']).path
        target_mentions = mentions.setdefault(target, [])

        if link['id'] not in mention_ids:
            mention_ids.add(link['id'])
            target_mentions.append(link)
            new += 1

    print(f"Found {new} new mentions")

    print(f"Storing mentions at {mentions_file}")
    with open(mentions_file, 'w', encoding='utf-8') as f:
        ujson.dump(mentions, f, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the click command group when this file is executed as a script.
if __name__ == "__main__":
    cli()
|