# brainsteam.co.uk/bstools/bstools.py
import click
import dotenv
import os
import requests
import ujson
import frontmatter
from urllib.parse import urlparse
from bs4 import BeautifulSoup
def get_html_title(url):
    """Fetch the HTML document at *url* and return its title text.

    Args:
        url (str): The URL to fetch HTML content from.

    Returns:
        str: The title of the fetched HTML content, or None if the request
        failed, returned a non-200 status, or the page has no/empty title.
    """
    try:
        # Bound the request so one unresponsive server cannot stall an
        # entire folder walk (requests has no default timeout).
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # soup.title is None when the document has no <title> element;
            # soup.title.string is None when the element is empty.
            title = soup.title.string if soup.title else None
            if not title:
                return None
            return title
        print(
            f"Failed to fetch HTML content. Status code: {response.status_code}")
    except Exception as e:
        # Best-effort helper: log and fall through to None rather than
        # aborting the caller's loop over many files.
        print(f"An error occurred: {e}")
    return None
@click.group()
def cli():
    """Root command group for bstools; runs before every subcommand."""
    # Load environment variables (e.g. WEBMENTIONSIO_API_KEY) from a
    # local .env file so subcommands can read them via os.environ.
    dotenv.load_dotenv()
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
def fetch_link_titles(folder):
    """Fetch titles for reply and bookmark links"""
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue
            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)
            print(f"Analysing... {full_path}")
            updated = False
            # Each property may be a bare URL string or a {url, title} dict.
            for property_name in ('in-reply-to', 'bookmark-of'):
                property_data = data.get(property_name)
                if not property_data:
                    continue
                if isinstance(property_data, str):
                    # Bare URL: skip tweets, otherwise upgrade to a dict
                    # carrying the fetched title alongside the URL.
                    if 'twitter.com' in property_data:
                        print(f"Not grabbing title for tweet in {property_name}")
                        continue
                    title = get_html_title(property_data)
                    if title is not None:
                        print(f"Found {property_name} title: '{title}'")
                        data[property_name] = {"url": property_data, "title": str(title)}
                        updated = True
                elif isinstance(property_data, dict) and 'url' in property_data:
                    # Dict form: only fill in a title that is still missing.
                    if 'twitter.com' in property_data['url']:
                        print(f"Not grabbing title for tweet in {property_name}")
                        continue
                    if 'title' not in property_data:
                        title = get_html_title(property_data['url'])
                        if title is not None:
                            print(f"Found {property_name} title: '{title}'")
                            property_data['title'] = str(title)
                            data[property_name] = property_data
                            updated = True
            if updated:
                print(f"Updating data... {full_path}")
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--old_type", type=str, required=True)
@click.option("--new_type", type=str, required=True)
def fix_post_types(folder: str, old_type: str, new_type: str):
    """Fix post type metadata"""
    for root, _, filenames in os.walk(folder):
        # Only markdown posts carry the frontmatter we care about.
        for name in (n for n in filenames if n.endswith(".md")):
            post_path = os.path.join(root, name)
            post = frontmatter.load(post_path)
            print(f"Analysing... {post_path}")
            if 'type' not in post:
                print(
                    f"Skipping {post_path} due to incomplete frontmatter")
                continue
            if post['type'] != old_type:
                continue
            print(
                f"Update type for {post_path}: {old_type}->{new_type}")
            post['type'] = new_type
            # Write back only the posts that actually changed.
            with open(post_path, 'wb') as f:
                frontmatter.dump(post, f)
@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)
def set_page_meta(folder: str, page_meta: str):
    """Set 'post_meta' on every markdown post, removing any legacy 'page_meta' key.

    The --page_meta option is a comma-separated list of field names which is
    stored verbatim (split on commas) as the 'post_meta' frontmatter value.
    """
    meta = page_meta.split(",")
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue
            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)
            print(f"Update page_meta for {full_path}: {meta}")
            # Drop the old key so only 'post_meta' remains in the frontmatter.
            if 'page_meta' in data:
                del data['page_meta']
            data['post_meta'] = meta
            with open(full_path, 'wb') as f:
                frontmatter.dump(data, f)
@cli.command()
@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)
def fetch_mentions(mentions_file: str):
    """Fetch web mentions and store as json"""
    mention_ids = set()
    if os.path.exists(mentions_file):
        print(f"Load existing mentions from {mentions_file}")
        with open(mentions_file, 'r') as f:
            mentions = ujson.load(f)
        print(mentions.keys())
        print(f"Found existing mentions for {len(mentions.keys())} urls")
    else:
        mentions = {}
    # Record every mention id we already hold so re-runs don't duplicate.
    for mentionset in mentions.values():
        mention_ids.update([post['id'] for post in mentionset])
    print("Requesting new mentions...")
    r = requests.get(
        f"https://webmention.io/api/mentions.json?token={os.environ.get('WEBMENTIONSIO_API_KEY')}")
    # Parse the response body once; the original re-parsed it on every access.
    payload = r.json()
    if payload.get('error') is not None:
        print(f"Failed to request webmentions: {payload}")
        return 1
    new = 0
    for link in payload['links']:
        # Mentions are keyed by the path component of the target URL.
        target = urlparse(link['target']).path
        if target not in mentions:
            mentions[target] = []
        if link['id'] not in mention_ids:
            mention_ids.add(link['id'])
            mentions[target].append(link)
            new += 1
    print(f"Found {new} new mentions")
    print(f"Storing mentions at {mentions_file}")
    with open(mentions_file, 'w') as f:
        ujson.dump(mentions, f, indent=2)
if __name__ == "__main__":
    # Dispatch to the click command group when run as a script.
    cli()