brainsteam.co.uk/bstools/bstools.py

210 lines
6.8 KiB
Python
Raw Permalink Normal View History

2022-01-02 16:37:40 +00:00
import click
import dotenv
import os
import requests
import ujson
import frontmatter
2022-01-02 16:37:40 +00:00
from urllib.parse import urlparse
2024-09-07 09:22:56 +01:00
from bs4 import BeautifulSoup
def get_html_title(url, timeout=10):
    """
    Fetch the HTML document at a URL and return the text of its <title>.

    Args:
        url (str): The URL to fetch HTML content from.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The title with whitespace runs collapsed to single spaces, or
        None if the request failed, the status code was not 200, or the
        document has no usable title.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # host, which would stall a whole batch run on a single dead link.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(
                f"Failed to fetch HTML content. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title is None when the element is missing; .string is None when
        # the element is empty or contains more than one child node.
        raw_title = soup.title.string if soup.title else None
        if not raw_title:
            return None

        # Titles often carry newlines and indentation from the page markup;
        # collapse them so the value is a clean single-line string.
        title = " ".join(str(raw_title).split())
        return title or None
    except Exception as e:
        # Deliberately broad: title lookup is best-effort and one bad link
        # must not abort the caller's loop over many files.
        print(f"An error occurred: {e}")
        return None
2022-01-02 16:37:40 +00:00
@click.group()
def cli():
    # Group callback: load variables via python-dotenv so subcommands can read
    # them from os.environ. Kept as a comment rather than a docstring because
    # click would surface a docstring as the group's help text.
    dotenv.load_dotenv()
2024-09-07 09:22:56 +01:00
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
def fetch_link_titles(folder):
    """Fetch titles for reply and bookmark links"""
    # Frontmatter keys that may hold either a bare URL string or a dict with
    # a 'url' (and optionally a 'title') entry.
    link_fields = ['in-reply-to', 'bookmark-of']

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            full_path = os.path.join(dirpath, filename)
            data = frontmatter.load(full_path)
            print(f"Analysing... {full_path}")

            updated = False
            for property_name in link_fields:
                property_data = data.get(property_name)
                if not property_data:
                    continue

                # Work out which URL this property points at, whatever its shape.
                if isinstance(property_data, str):
                    link_url = property_data
                elif isinstance(property_data, dict) and 'url' in property_data:
                    link_url = property_data['url']
                else:
                    continue

                if 'twitter.com' in link_url:
                    print(f"Not grabbing title for tweet in {property_name}")
                    continue

                if isinstance(property_data, dict):
                    # A dict that already carries a title needs no lookup.
                    if 'title' in property_data:
                        continue
                    entry = property_data
                else:
                    # Bare URL strings are promoted to the dict form.
                    entry = {"url": property_data}

                title = get_html_title(link_url)
                if title is None:
                    continue

                print(f"Found {property_name} title: '{title}'")
                entry['title'] = str(title)
                data[property_name] = entry
                updated = True

            # Only rewrite files whose frontmatter actually changed.
            if updated:
                print(f"Updating data... {full_path}")
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)
2024-09-07 09:22:56 +01:00
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--old_type", type=str, required=True)
@click.option("--new_type", type=str, required=True)
def fix_post_types(folder: str, old_type: str, new_type: str):
    """Fix post type metadata"""
    for dirpath, _, filenames in os.walk(folder):
        markdown_paths = [
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(".md")
        ]

        for full_path in markdown_paths:
            data = frontmatter.load(full_path)
            print(f"Analysing... {full_path}")

            # Posts without a 'type' key cannot be migrated.
            if 'type' not in data:
                print(
                    f"Skipping {full_path} due to incomplete frontmatter")
                continue

            # Leave posts of any other type alone.
            if data['type'] != old_type:
                continue

            print(
                f"Update type for {full_path}: {old_type}->{new_type}")
            data['type'] = new_type

            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)
2023-07-09 11:33:28 +01:00
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)
def set_page_meta(folder: str, page_meta: str):
    # No docstring on purpose: click would expose it as command help text.
    # Splits the comma separated option once, then stamps the resulting list
    # onto every markdown file under `folder`.
    meta = page_meta.split(",")

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            full_path = os.path.join(dirpath, filename)
            data = frontmatter.load(full_path)
            print(f"Update page_meta for {full_path}: {meta}")

            # Any existing 'page_meta' key is removed; the list is stored
            # under 'post_meta' instead.
            if 'page_meta' in data:
                del data['page_meta']
            data['post_meta'] = meta

            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)
2022-01-02 16:37:40 +00:00
@cli.command()
@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)
def fetch_mentions(mentions_file: str):
    """Fetch web mentions and store as json"""
    # The stored file maps a target URL path to the list of mention objects
    # received for it. Existing entries are loaded, new mentions from
    # webmention.io are merged in (de-duplicated by mention id), and the
    # result is written back to the same file.
    #
    # Exits with status 1 if the API key is missing or the API reports an
    # error. A plain `return 1` would be discarded by click and the process
    # would still exit 0.
    if os.path.exists(mentions_file):
        print(f"Load existing mentions from {mentions_file}")
        with open(mentions_file, 'r', encoding='utf-8') as f:
            mentions = ujson.load(f)
        print(mentions.keys())
        print(f"Found existing mentions for {len(mentions.keys())} urls")
    else:
        mentions = {}

    # Ids already on disk, used to skip mentions we have seen before.
    mention_ids = {
        post['id']
        for mentionset in mentions.values()
        for post in mentionset
    }

    token = os.environ.get('WEBMENTIONSIO_API_KEY')
    if not token:
        # Previously an unset key was sent to the API as the literal "None".
        print("Failed to request webmentions: WEBMENTIONSIO_API_KEY is not set")
        raise SystemExit(1)

    print("Requesting new mentions...")
    # Passing the token via params lets requests URL-encode it; the timeout
    # stops the command hanging forever on an unresponsive server.
    r = requests.get(
        "https://webmention.io/api/mentions.json",
        params={"token": token},
        timeout=30,
    )

    # Parse the body once instead of on every access.
    payload = r.json()
    if payload.get('error') is not None:
        print(f"Failed to request webmentions: {payload}")
        raise SystemExit(1)

    new = 0
    for link in payload['links']:
        # Mentions are grouped by the path component of the target URL.
        target = urlparse(link['target']).path
        bucket = mentions.setdefault(target, [])

        if link['id'] not in mention_ids:
            mention_ids.add(link['id'])
            bucket.append(link)
            new += 1

    print(f"Found {new} new mentions")
    print(f"Storing mentions at {mentions_file}")
    with open(mentions_file, 'w', encoding='utf-8') as f:
        ujson.dump(mentions, f, indent=2)
# Script entry point: hand control to the click command group.
if __name__ == "__main__":
    cli()