mirror of https://github.com/tcsenpai/mysides.git
synced 2025-06-04 10:10:05 +00:00

Unified the various modules and added initial support for multiple sites

commit adabc100e6 (parent 53e4b5c49d)
.env.example (13 lines changed)
@@ -1,2 +1,15 @@
 PPLX_API_KEY="your perplexity ai key"
 MODEL="pplx-7b-chat"
+NEWS="world-news"
+POSSIBLE_NEWS_VALUES= '
+"world-news",
+"us-news",
+"politics",
+"sports",
+"entertainment",
+"business",
+"science",
+"ap-fact-check",
+"oddities",
+"health"
+'
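For reference, a minimal sketch of how these new variables can be read back with python-dotenv; it mirrors what main.py below does, and the parsing assumes the quoted, comma-separated POSSIBLE_NEWS_VALUES format shown above:

# Minimal sketch: load the new .env values and normalize POSSIBLE_NEWS_VALUES.
# Assumes the quoted, comma-separated format from .env.example above.
import os

from dotenv import load_dotenv

load_dotenv()

news_type = os.getenv("NEWS", "world-news")
raw_values = os.getenv("POSSIBLE_NEWS_VALUES", "")

# Strip whitespace and surrounding quotes from each comma-separated entry.
available_news = [v.strip().strip('"') for v in raw_values.split(",") if v.strip().strip('"')]

print("Selected:", news_type)
print("Available:", available_news)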
.gitignore (vendored, 2 lines changed)
@@ -1,5 +1,5 @@
 link
-allsides.html
+ap.html
 test.html
 response
 models/
README.md

@@ -8,12 +8,12 @@ I maintain a daily updated (if I use this software) archive section. This is mad
 
 ## Disclaimer
 
-MySides is a personal tool designed to scrape news from AllSides. Please note that all material downloaded, used, and reworked by this software is the property of AllSides. This tool aims to provide a quick overview of daily news. For more information, please refer to the AllSides Terms of Service.
+MySides is a personal tool designed to scrape news from APNews. Please note that all material downloaded, used, and reworked by this software is the property of APNews. This tool aims to provide a quick overview of daily news. For more information, please refer to the APNews Terms of Service.
 
 
 ## TLDR
 
-MySides scrapes the latest news from AllSides and uses Perplexity AI APIs to summarize them into a concise, single-page format.
+MySides scrapes the latest news from APNews and uses Perplexity AI APIs to summarize them into a concise, single-page format.
 
 ## Perplexity AI?
 
@@ -31,4 +31,4 @@ In my experience, Perplexity AI offers more competitive API pricing than OpenAI.
 
 ## Read
 
-Check out allsides.html for the latest summary. The reports are saved into archive/ by default.
+Check out ap.html for the latest summary. The reports are saved into archive/ by default.
BIN __pycache__/apnews.cpython-311.pyc (new file, binary file not shown)
BIN __pycache__/summarizer.cpython-311.pyc (new file, binary file not shown)
apnews.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+
+def fetchAndDigest(soup):
+
+    news_items = soup.find_all("div", class_="PagePromo")
+    print("[+] Filtering out invalid articles...")
+    links = []
+    for news_item in news_items:
+        article_title = news_item['data-gtm-region']
+        # Extract the article link and title
+        try:
+            article_link = news_item.find_all("div", class_="PagePromo-media").pop().find("a").get("href")
+        except Exception:
+            try:
+                article_link = news_item.find_all("h3", class_="PagePromo-title").pop().find("a").get("href")
+            except Exception:
+                print("[!] Invalid article. Skipping...")
+                print(news_item)
+                continue
+        links.append([article_title, article_link])
+
+    print("[+] Total news: " + str(len(links)))
+    return links
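A hedged usage sketch for this new module; it assumes AP News section pages still use the PagePromo markup that fetchAndDigest targets, and that requests and beautifulsoup4 are installed:

# Sketch: fetch an AP News section page and extract (title, link) pairs
# with the new apnews module. The URL pattern mirrors main.py's usage.
import requests
from bs4 import BeautifulSoup

import apnews

url = "https://apnews.com/world-news"
response = requests.get(url, timeout=5)
soup = BeautifulSoup(response.text, "html.parser")

links = apnews.fetchAndDigest(soup)
for article_title, article_link in links[:5]:
    print(article_title, "->", article_link)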
File diff suppressed because one or more lines are too long
archiver.sh

@@ -8,4 +8,4 @@ CLEAN=${CLEAN// /_}
 CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
 # finally, lowercase with TR
 CLEAN=`echo -n $CLEAN | tr A-Z a-z`
-cp allsides.html archive/$CLEAN.html
+cp ap.html archive/$CLEAN.html
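For clarity, a rough Python rendering of what the visible part of archiver.sh does after this change; the value CLEAN starts from lies outside this hunk, so a placeholder label is used here (the script itself stays in bash):

# Rough equivalent of the visible part of archiver.sh:
# sanitize a label into a slug and copy ap.html into archive/.
import re
import shutil
from pathlib import Path

label = "placeholder label"  # whatever archiver.sh puts in CLEAN before this hunk

clean = label.replace(" ", "_")              # CLEAN=${CLEAN// /_}
clean = re.sub(r"[^a-zA-Z0-9_]", "", clean)  # CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
clean = clean.lower()                        # tr A-Z a-z

Path("archive").mkdir(exist_ok=True)
shutil.copy("ap.html", "archive/" + clean + ".html")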
main.py (149 lines changed)
@@ -1,68 +1,80 @@
 import os
 
 import requests
 from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+# Our modules
+import apnews
+import summarizer
+
+load_dotenv()
+
+# Loading environment variables
+news_type = os.getenv("NEWS")
+pplx_api_key = os.getenv("PPLX_API_KEY")
+model = os.getenv("MODEL")
+
+# Main menu
+def menu():
+    global news_type
+    available_news = os.getenv("POSSIBLE_NEWS_VALUES")
+    available_news = available_news.split(",")
+    print("[ Welcome to MySides ]")
+    print("[+] Available news: ")
+    counter = 0
+    for avail in available_news:
+        counter += 1
+        print(str(counter) + ") " + avail.strip().replace('"', ""))
+
+    print("[+] Current news: " + news_type)
+    print("[+] Press enter to continue or type a number to change the news type.")
+    news_type_n = input().strip()
+    if news_type_n == "":
+        return
+    try:
+        news_type_n = int(news_type_n)
+    except Exception:
+        menu()
+        print("[!] Invalid news type.")
+    news_type_n -= 1
+    try:
+        news_type = available_news[news_type_n]
+    except Exception:
+        menu()
+        print("[!] Invalid news type.")
+
+# Fetch and summarize the article
+def transform_links(links):
+    datas = []
+    counter = 0
+    print("[+] Extracting data from articles...")
+    for link in links:
+        counter += 1
+        print("[+] Article " + str(counter) + " of " + str(len(links)))
+        article_title = link[0]
+        article_link = link[1]
+        print("[ " + article_title + " ]")
+        print("[+] Extracting data from: " + article_link)
+        try:
+            article_summary = summarizer.summarize(article_link, pplx_api_key, model)
+        except Exception as e:
+            print(e)
+            print("[!] Invalid article. Skipping...")
+            continue
+        datas.append(
+            {
+                "article_title": article_title,
+                "article_link": article_link,
+                "article_summary": article_summary,
+            })
+    return datas
 
-
+# Downloads the site and extracting the data using the appropriate module
 def extract_data(url):
     response = requests.get(url, timeout=5)
     soup = BeautifulSoup(response.text, "html.parser")
-    news_items = soup.find_all("div", class_="news-item")
-    datas = []
-    tot_articles = len(news_items)
-    print("[+] Total news: " + str(tot_articles))
-    print("[+] Filtering out invalid articles...")
-    counter = 0
-    for news_item in news_items:
-        # Extract the article link and title
-        article_link = news_item.find_all("a")[0].get("href")
-        if "allsides.com" not in article_link:
-            tot_articles -= 1
-            continue
-        counter += 1
-        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
-        article_title = news_item.find("div", class_="news-title").text.strip()
-        print("[*] Summarizing: " + article_link)
-        # Summarize the article
-        with open("link", "w+") as f:
-            f.write(article_link)
-        # trunk-ignore(bandit/B605)
-        # trunk-ignore(bandit/B607)
-        os.system("python summarizer.py")
-        print("[OK] Done. Proceeding...")
-        with open("response", "r") as f:
-            article_summary = f.read().strip()
-        # with open(article_title, "w+") as f:
-        # f.write(article_summary)
-        # Extract the source and media bias rating
-        try:
-            source_name = news_item.find("span").text
-        except Exception:
-            source_name = "Unknown"
-
-        try:
-            media_bias_rating = (
-                news_item.find("img")
-                .get("alt")
-                .replace("AllSides Media Bias Rating: ", "")
-                .lower()
-            )
-        except Exception:
-            media_bias_rating = "Unknown"
-
-        # Build the JSON
-        data = {
-            "article_link": article_link,
-            "article_title": article_title,
-            "article_summary": article_summary,
-            "source_name": source_name,
-            "media_bias_rating": media_bias_rating,
-        }
-
-        datas.append(data)
-
-    return datas
-
-
+    links = apnews.fetchAndDigest(soup)
+    transform_links(links)
 def handle_pagination(soup):
     next_page = soup.find("a", {"rel": "next"})
@@ -72,7 +84,8 @@ def handle_pagination(soup):
 
 
 def main():
-    url = "https://www.allsides.com/unbiased-balanced-news"
+    global news_type
+    url = "https://apnews.com/" + news_type
     all_data = []
 
     while url:
@@ -102,37 +115,25 @@ def main():
     """
 
     # Create a nice HTML view of all the articles each one in its own page
-    html = "<html><head><title>AllSides Unbiased News</title>"
+    html = "<html><head><title>APNews Unbiased News</title>"
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
         html += "<h1>" + item["article_title"] + "</h1>"
-        html += "<h2>" + item["source_name"] + "</h2>"
-        html += "<h3>" + item["media_bias_rating"] + "</h3>"
         html += "<p>" + item["article_summary"] + "</p>"
         html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
-    with open("allsides.html", "w+") as f:
+    with open("ap.html", "w+") as f:
        f.write(html)
 
     # Archiving (skip if causes errors)
     os.system("./archiver.sh")
 
     print("Total articles: ", len(all_data))
-    # Do some math to find the number of articles per bias rating
-    bias_ratings = {}
-    for item in all_data:
-        if item["media_bias_rating"] in bias_ratings:
-            bias_ratings[item["media_bias_rating"]] += 1
-        else:
-            bias_ratings[item["media_bias_rating"]] = 1
-    # Assign percentages
-    for key in bias_ratings:
-        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
-
-    print(bias_ratings)
 
 
 if __name__ == "__main__":
+    menu()
+    print("[+] News type: " + news_type)
     main()
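Taken together, the new flow in main.py is: menu() picks a section from POSSIBLE_NEWS_VALUES, the AP News URL is built from NEWS, apnews.fetchAndDigest() extracts (title, link) pairs, and transform_links() hands each link to summarizer.summarize(). A condensed, hedged sketch of that pipeline, with the menu and HTML output omitted:

# Condensed sketch of the pipeline this commit wires together.
# Assumes .env provides NEWS, PPLX_API_KEY and MODEL as in .env.example.
import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

import apnews
import summarizer

load_dotenv()

url = "https://apnews.com/" + os.getenv("NEWS", "world-news")
soup = BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")

for article_title, article_link in apnews.fetchAndDigest(soup):
    summary = summarizer.summarize(article_link, os.getenv("PPLX_API_KEY"), os.getenv("MODEL"))
    print("[ " + article_title + " ]")
    print(summary)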
summarizer.py

@@ -1,46 +1,36 @@
-import os
-
 import requests
-from dotenv import load_dotenv
 
-load_dotenv()
-
-pplx_api_key = os.getenv("PPLX_API_KEY")
-model = os.getenv("MODEL")
-
-with open("link", "r") as f:
-    article_link = f.read().strip()
-
-headers = {
-    "accept": "application/json",
-    "authorization": "Bearer " + pplx_api_key,
-    "content-type": "application/json",
-}
-
-json_data = {
-    "model": model,
-    "messages": [
-        {
-            "role": "system",
-            "content": "Be precise, concise and clear",
-        },
-        {
-            "role": "user",
-            "content": "Search and summarize: " + article_link,
-        },
-    ],
-}
-
-response = requests.post(
-    "https://api.perplexity.ai/chat/completions",
-    headers=headers,
-    json=json_data,
-    timeout=5,
-)
-
-response = response.json()
-# print(response)
-
-# print(response["choices"][0]["message"]["content"])
-with open("response", "w+") as response_file:
-    response_file.write(response["choices"][0]["message"]["content"])
+def summarize(link, pplx_api_key, model):
+    headers = {
+        "accept": "application/json",
+        "authorization": "Bearer " + pplx_api_key,
+        "content-type": "application/json",
+    }
+
+    json_data = {
+        "model": model,
+        "messages": [
+            {
+                "role": "system",
+                "content": "Be precise, concise and clear. Also proofread what you write and make sure not to hallucinate.",
+            },
+            {
+                "role": "user",
+                "content": "Read and summarize: " + link,
+            },
+        ],
+    }
+
+    response = requests.post(
+        "https://api.perplexity.ai/chat/completions",
+        headers=headers,
+        json=json_data,
+        timeout=5,
+    )
+
+    response = response.json()
+    # print(response)
+    try:
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        return "Error: " + str(e)
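A hedged usage example for the refactored summarizer module: summarize() now takes the link, API key and model as arguments instead of reading them from the "link" file and the environment itself. The article URL below is a placeholder, and a valid Perplexity API key is required:

# Sketch: call the new summarize() directly with values from .env.
import os

from dotenv import load_dotenv

import summarizer

load_dotenv()

summary = summarizer.summarize(
    "https://apnews.com/world-news",  # placeholder link for illustration
    os.getenv("PPLX_API_KEY"),
    os.getenv("MODEL"),
)
print(summary)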