brainsteam.co.uk/bstools/bstools.py

import click
import dotenv
import os
import requests
import ujson
import frontmatter
from urllib.parse import urlparse

from bs4 import BeautifulSoup


def get_html_title(url, timeout=10):
    """
    Fetches the HTML content from a given URL and returns its title.

    Args:
        url (str): The URL to fetch HTML content from.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout the request can block indefinitely on an
            unresponsive host.

    Returns:
        str: The title of the fetched HTML content with runs of whitespace
        collapsed to single spaces, or None if the request failed, the
        status code was not 200, or no non-empty title could be found.
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, timeout=timeout)

        # Anything other than a plain 200 is treated as "no title available"
        if response.status_code != 200:
            print(
                f"Failed to fetch HTML content. Status code: {response.status_code}")
            return None

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        if soup.title is None:
            return None

        # get_text() still yields text when the <title> contains nested
        # markup (where .string would be None); split/join trims the title
        # and collapses embedded newlines so it fits on one frontmatter line
        title = " ".join(soup.title.get_text().split())

        # Return None if the title turned out to be empty
        return title or None

    except Exception as e:
        # Deliberately broad: this is a best-effort lookup used in a batch
        # job, so one bad URL is reported and skipped instead of aborting
        print(f"An error occurred: {e}")
        return None


@click.group()
def cli():
    # Group callback shared by every subcommand: load variables from a
    # .env file into the process environment before the command body runs.
    dotenv.load_dotenv()

def _with_fetched_title(property_name, property_data):
    """
    Build the titled form of a link property, fetching the title if needed.

    Args:
        property_name (str): Frontmatter key being processed (used in log
            output only).
        property_data: Either a bare URL string, or a dict holding at least
            a 'url' key and optionally a 'title' key.

    Returns:
        dict: A new ``{"url": ..., "title": ...}`` style mapping (existing
        dict keys are preserved), or None when nothing should change: the
        value is empty or of an unsupported shape, it points at twitter.com,
        it already has a title, or no title could be fetched.
    """
    if not property_data:
        return None

    if isinstance(property_data, str):
        url = property_data
        entry = {"url": url}
    elif isinstance(property_data, dict) and isinstance(property_data.get('url'), str):
        url = property_data['url']
        entry = property_data
    else:
        return None

    if 'twitter.com' in url:
        print(f"Not grabbing title for tweet in {property_name}")
        return None

    # Never overwrite a title that is already recorded
    if 'title' in entry:
        return None

    title = get_html_title(url)
    if title is None:
        return None

    print(f"Found {property_name} title: '{title}'")
    return {**entry, "title": str(title)}


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
def fetch_link_titles(folder):
    """Fetch titles for reply and bookmark links"""

    properties_to_check = ['in-reply-to', 'bookmark-of']

    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(".md"):
                continue

            full_path = os.path.join(root, file)
            data = frontmatter.load(full_path)

            print(f"Analysing... {full_path}")

            updated = False

            for property_name in properties_to_check:
                # Handles both the bare-URL and the {"url": ...} dict form;
                # previously the dict form was nested under the string
                # branch and could never be reached
                enriched = _with_fetched_title(
                    property_name, data.get(property_name))

                if enriched is not None:
                    data[property_name] = enriched
                    updated = True

            # Only rewrite files whose frontmatter actually changed
            if updated:
                print(f"Updating data... {full_path}")
                # frontmatter.dump writes encoded bytes, hence binary mode
                with open(full_path, 'wb') as f:
                    frontmatter.dump(data, f)


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--old_type", type=str, required=True)
@click.option("--new_type", type=str, required=True)
def fix_post_types(folder: str, old_type: str, new_type: str):
    """Fix post type metadata"""

    # Walk the whole tree under `folder`, visiting every markdown file.
    for dirpath, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            post_path = os.path.join(dirpath, filename)
            post = frontmatter.load(post_path)

            print(f"Analysing... {post_path}")

            # Posts without a 'type' key cannot be retyped.
            if 'type' not in post:
                print(f"Skipping {post_path} due to incomplete frontmatter")
                continue

            # Leave posts of any other type untouched (and unwritten).
            if post['type'] != old_type:
                continue

            print(f"Update type for {post_path}: {old_type}->{new_type}")
            post['type'] = new_type

            # The file is opened in binary mode for frontmatter.dump.
            with open(post_path, 'wb') as handle:
                frontmatter.dump(post, handle)


@cli.command()
@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)
@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)
def set_page_meta(folder: str, page_meta: str):
    # Rewrites every markdown file under `folder` so that its frontmatter
    # carries the given field list. No docstring is used here so the
    # command's --help output stays as it was.

    fields = page_meta.split(",")

    for dirpath, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            if not filename.endswith(".md"):
                continue

            post_path = os.path.join(dirpath, filename)
            post = frontmatter.load(post_path)

            print(f"Update page_meta for {post_path}: {fields}")

            # NOTE(review): any existing 'page_meta' key is dropped and the
            # list is stored under 'post_meta' - presumably a deliberate
            # key rename; confirm against the site templates.
            if 'page_meta' in post:
                del post['page_meta']
            post['post_meta'] = fields

            # Every markdown file is rewritten, changed or not.
            with open(post_path, 'wb') as handle:
                frontmatter.dump(post, handle)


@cli.command()
@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)
def fetch_mentions(mentions_file: str):
    """Fetch web mentions and store as json"""

    # Fail fast with a clear message instead of sending "token=None"
    token = os.environ.get('WEBMENTIONSIO_API_KEY')
    if not token:
        raise click.ClickException("WEBMENTIONSIO_API_KEY is not set")

    # Existing mentions are stored as {target path: [mention, ...]}
    if os.path.exists(mentions_file):
        print(f"Load existing mentions from {mentions_file}")
        with open(mentions_file, 'r', encoding='utf-8') as f:
            mentions = ujson.load(f)
        print(f"Found existing mentions for {len(mentions)} urls")
    else:
        mentions = {}

    # IDs already on disk, used to skip mentions we have stored before
    mention_ids = {
        post['id']
        for mentionset in mentions.values()
        for post in mentionset
    }

    print("Requesting new mentions...")
    try:
        # params= URL-encodes the token; the timeout stops a stalled
        # connection from hanging the command forever
        r = requests.get(
            "https://webmention.io/api/mentions.json",
            params={"token": token},
            timeout=30,
        )
    except requests.RequestException as err:
        raise click.ClickException(
            f"Failed to request webmentions: {err}") from err

    # Parse the body once; a non-JSON reply (e.g. a gateway error page)
    # becomes a readable CLI error rather than a traceback
    try:
        payload = r.json()
    except ValueError as err:
        raise click.ClickException(
            f"Failed to request webmentions: HTTP {r.status_code} "
            "with a non-JSON body") from err

    # ClickException makes the process exit non-zero; a plain `return 1`
    # from a click command does not set the exit status
    if payload.get('error') is not None:
        raise click.ClickException(
            f"Failed to request webmentions: {payload}")

    new = 0
    for link in payload.get('links', []):
        if link['id'] in mention_ids:
            continue

        # Mentions are grouped by the path component of the target URL
        target = urlparse(link['target']).path

        mention_ids.add(link['id'])
        mentions.setdefault(target, []).append(link)
        new += 1

    print(f"Found {new} new mentions")

    print(f"Storing mentions at {mentions_file}")
    with open(mentions_file, 'w', encoding='utf-8') as f:
        ujson.dump(mentions, f, indent=2)


# Invoke the click command group only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()
add BStools 2022-01-02 16:37:40 +00:00			`import click`
			`import dotenv`
			`import os`
			`import requests`
			`import ujson`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`import frontmatter`
add BStools 2022-01-02 16:37:40 +00:00			`from urllib.parse import urlparse`

fix some stuff 2024-09-07 09:22:56 +01:00			`from bs4 import BeautifulSoup`


			`def get_html_title(url):`
			`"""`
			`Fetches the HTML content from a given URL and returns its title.`

			`Args:`
			`url (str): The URL to fetch HTML content from.`

			`Returns:`
			`str: The title of the fetched HTML content, or None if it couldn't be found.`
			`"""`
			`try:`
			`# Send an HTTP GET request to the URL`
			`response = requests.get(url)`

			`# Check if the request was successful (status code 200)`
			`if response.status_code == 200:`
			`# Parse the HTML content using BeautifulSoup`
			`soup = BeautifulSoup(response.content, 'html.parser')`

			`# Find and return the title of the HTML document`
			`title = None`

			`if soup.title:`
			`title = soup.title.string`

			`# Return None if no title could be found`
			`if not title:`
			`return None`

			`return title`

			`else:`
			`print(`
			`f"Failed to fetch HTML content. Status code: {response.status_code}")`

			`except Exception as e:`
			`print(f"An error occurred: {e}")`


add BStools 2022-01-02 16:37:40 +00:00			`@click.group()`
			`def cli():`
			`dotenv.load_dotenv()`
			`pass`

fix some stuff 2024-09-07 09:22:56 +01:00			`@cli.command()`
			`@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)`
			`def fetch_link_titles(folder):`
			`"""Fetch titles for reply and bookmark links"""`

			`for root, _, files in os.walk(folder):`
			`for file in files:`
			`if file.endswith(".md"):`
			`full_path = os.path.join(root, file)`
			`data = frontmatter.load(full_path)`

			`print(f"Analysing... {full_path}")`

implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`properties_to_check = ['in-reply-to', 'bookmark-of']`
			`updated = False`

			`for property_name in properties_to_check:`

			`property_data = data.get(property_name)`

			`if property_data:`
			`if isinstance(property_data, str):`
			`if 'twitter.com' in property_data:`
			`print(f"Not grabbing title for tweet in {property_name}")`
			`continue`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`title = get_html_title(property_data)`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`if title is not None:`
			`print(f"Found {property_name} title: '{title}'")`
			`data[property_name] = {"url": property_data, "title": str(title)}`
			`updated = True`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`elif isinstance(property_data, dict) and 'url' in property_data:`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`if 'twitter.com' in property_data['url']:`
			`print(f"Not grabbing title for tweet in {property_name}")`
			`continue`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`if 'title' not in property_data:`
			`title = get_html_title(property_data['url'])`
			`if title is not None:`
			`print(f"Found {property_name} title: '{title}'")`
			`property_data['title'] = str(title)`
			`data[property_name] = property_data`
			`updated = True`
fix some stuff 2024-09-07 09:22:56 +01:00
implement title getter for bookmarks 2024-09-08 12:09:41 +01:00			`if updated:`
			`print(f"Updating data... {full_path}")`
			`with open(full_path, 'wb') as f:`
			`frontmatter.dump(data, f)`
fix some stuff 2024-09-07 09:22:56 +01:00

add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`@cli.command()`
			`@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)`
			`@click.option("--old_type", type=str, required=True)`
			`@click.option("--new_type", type=str, required=True)`
			`def fix_post_types(folder: str, old_type: str, new_type: str):`
			`"""Fix post type metadata"""`

add page meta tool for bstools 2023-07-09 11:33:28 +01:00			`for root, _, files in os.walk(folder):`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00
			`for file in files:`
			`if file.endswith(".md"):`
fix some stuff 2024-09-07 09:22:56 +01:00			`full_path = os.path.join(root, file)`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`data = frontmatter.load(full_path)`

			`print(f"Analysing... {full_path}")`

			`if 'type' not in data:`
fix some stuff 2024-09-07 09:22:56 +01:00			`print(`
			`f"Skipping {full_path} due to incomplete frontmatter")`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`continue`

fix some stuff 2024-09-07 09:22:56 +01:00			`if (data['type'] == old_type):`
			`print(`
			`f"Update type for {full_path}: {old_type}->{new_type}")`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`data['type'] = new_type`

fix some stuff 2024-09-07 09:22:56 +01:00			`with open(full_path, 'wb') as f:`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00			`frontmatter.dump(data, f)`


add page meta tool for bstools 2023-07-09 11:33:28 +01:00			`@cli.command()`
			`@click.option("--folder", type=click.Path(dir_okay=True, file_okay=False), required=True)`
			`@click.option("--page_meta", type=str, help="comma separated list of fields to include in page meta", required=True)`
			`def set_page_meta(folder: str, page_meta: str):`

			`meta = page_meta.split(",")`

			`for root, _, files in os.walk(folder):`

			`for file in files:`
			`if file.endswith(".md"):`
fix some stuff 2024-09-07 09:22:56 +01:00			`full_path = os.path.join(root, file)`
add page meta tool for bstools 2023-07-09 11:33:28 +01:00			`data = frontmatter.load(full_path)`

			`print(f"Update page_meta for {full_path}: {meta}")`
			`if 'page_meta' in data:`
			`del data['page_meta']`
			`data['post_meta'] = meta`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00
fix some stuff 2024-09-07 09:22:56 +01:00			`with open(full_path, 'wb') as f:`
add page meta tool for bstools 2023-07-09 11:33:28 +01:00			`frontmatter.dump(data, f)`
add command to bstools for retyping posts 2023-07-09 11:06:57 +01:00

add BStools 2022-01-02 16:37:40 +00:00			`@cli.command()`
			`@click.option("--mentions-file", type=click.Path(file_okay=True), required=True)`
			`def fetch_mentions(mentions_file: str):`
			`"""Fetch web mentions and store as json"""`

			`mention_ids = set()`
fix some stuff 2024-09-07 09:22:56 +01:00
add BStools 2022-01-02 16:37:40 +00:00			`if os.path.exists(mentions_file):`
			`print(f"Load existing mentions from {mentions_file}")`
fix some stuff 2024-09-07 09:22:56 +01:00			`with open(mentions_file, 'r') as f:`
add BStools 2022-01-02 16:37:40 +00:00			`mentions = ujson.load(f)`
			`print(mentions.keys())`
			`print(f"Found existing mentions for {len(mentions.keys())} urls")`
			`else:`
			`mentions = {}`

			`for mentionset in mentions.values():`
			`mention_ids.update([post['id'] for post in mentionset])`

			`print("Requesting new mentions...")`
fix some stuff 2024-09-07 09:22:56 +01:00			`r = requests.get(`
			`f"https://webmention.io/api/mentions.json?token={os.environ.get('WEBMENTIONSIO_API_KEY')}")`
add BStools 2022-01-02 16:37:40 +00:00
			`if r.json().get('error') is not None:`
			`print(f"Failed to request webmentions: {r.json()}")`
			`return 1`

			`new = 0`
			`for link in r.json()['links']:`
			`target = urlparse(link['target']).path`

			`if target not in mentions:`
			`mentions[target] = []`

			`if link['id'] not in mention_ids:`
			`mention_ids.add(link['id'])`
			`mentions[target].append(link)`
			`new += 1`

			`print(f"Found {new} new mentions")`

			`print(f"Storing mentions at {mentions_file}")`
fix some stuff 2024-09-07 09:22:56 +01:00			`with open(mentions_file, 'w') as f:`
add BStools 2022-01-02 16:37:40 +00:00			`ujson.dump(mentions, f, indent=2)`


			`if __name__ == "__main__":`
fix some stuff 2024-09-07 09:22:56 +01:00			`cli()`