import os
from urllib.parse import urlparse

import click
import dotenv
import frontmatter
import requests
import ujson
from bs4 import BeautifulSoup


def get_html_title(url):
    """
    Fetches the HTML content from a given URL and returns its title.

    Args:
        url (str): The URL to fetch HTML content from.

    Returns:
        str: The title of the fetched HTML document, or None if it couldn't
        be fetched or has no title.
    """
    try:
        # Send an HTTP GET request to the URL; the timeout stops a dead
        # server from hanging the whole run
        response = requests.get(url, timeout=30)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Return the <title> text, or None if the document has no title
            if soup.title and soup.title.string:
                return soup.title.string
            return None

        print(f"Failed to fetch HTML content. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None


@click.group()
def cli():
    # Load environment variables (e.g. WEBMENTIONSIO_API_KEY) from a .env file
    dotenv.load_dotenv()


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
def fetch_link_titles(folder):
    """Fetch titles for reply and bookmark links"""
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue
            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)
            print(f"Analysing... {full_path}")
            properties_to_check = ['in-reply-to', 'bookmark-of']
            updated = False
            for property_name in properties_to_check:
                property_data = data.get(property_name)
                if not property_data:
                    continue
                if isinstance(property_data, str):
                    # Bare URL: expand it into a {url, title} mapping
                    if 'twitter.com' in property_data:
                        print(f"Not grabbing title for tweet in {property_name}")
                        continue
                    title = get_html_title(property_data)
                    if title is not None:
                        print(f"Found {property_name} title: '{title}'")
                        data[property_name] = {"url": property_data, "title": str(title)}
                        updated = True
                elif isinstance(property_data, dict) and 'url' in property_data:
                    # Mapping: only fill in the title if it's missing
                    if 'twitter.com' in property_data['url']:
                        print(f"Not grabbing title for tweet in {property_name}")
                        continue
                    if 'title' not in property_data:
                        title = get_html_title(property_data['url'])
                        if title is not None:
                            print(f"Found {property_name} title: '{title}'")
                            property_data['title'] = str(title)
                            data[property_name] = property_data
                            updated = True
            if updated:
                print(f"Updating data... {full_path}")
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--old_type", type=str, required=True)
@click.option("--new_type", type=str, required=True)
def fix_post_types(folder: str, old_type: str, new_type: str):
    """Fix post type metadata"""
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue
            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)
            print(f"Analysing... {full_path}")
            if 'type' not in data:
                print(f"Skipping {full_path} due to incomplete frontmatter")
                continue
            if data['type'] == old_type:
                print(f"Update type for {full_path}: {old_type}->{new_type}")
                data['type'] = new_type
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)
def set_page_meta(folder: str, page_meta: str):
    """Set post_meta fields, replacing any legacy page_meta key"""
    meta = page_meta.split(",")
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue
            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)
            print(f"Update page_meta for {full_path}: {meta}")
            # Drop the old key name before writing the new one
            if 'page_meta' in data:
                del data['page_meta']
            data['post_meta'] = meta
            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)


@cli.command()
@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)
def fetch_mentions(mentions_file: str):
    """Fetch web mentions and store as json"""
    mention_ids = set()
    if os.path.exists(mentions_file):
        print(f"Load existing mentions from {mentions_file}")
        with open(mentions_file, 'r') as f:
            mentions = ujson.load(f)
        print(f"Found existing mentions for {len(mentions)} urls")
    else:
        mentions = {}

    # Collect the ids of every mention we already have, so re-runs don't
    # append duplicates
    for mentionset in mentions.values():
        mention_ids.update(post['id'] for post in mentionset)

    print("Requesting new mentions...")
    r = requests.get(
        f"https://webmention.io/api/mentions.json?token={os.environ.get('WEBMENTIONSIO_API_KEY')}",
        timeout=30)
    payload = r.json()
    if payload.get('error') is not None:
        # Returning a value from a click command doesn't set the exit code,
        # so raise instead
        raise click.ClickException(f"Failed to request webmentions: {payload}")

    new = 0
    for link in payload['links']:
        # Key mentions by the path of the page they target
        target = urlparse(link['target']).path
        if target not in mentions:
            mentions[target] = []
        if link['id'] not in mention_ids:
            mention_ids.add(link['id'])
            mentions[target].append(link)
            new += 1
    print(f"Found {new} new mentions")

    print(f"Storing mentions at {mentions_file}")
    with open(mentions_file, 'w') as f:
        ujson.dump(mentions, f, indent=2)


if __name__ == "__main__":
    cli()