Unified the various modules and added initial support for multiple sites

thecookingsenpai 2024-01-13 00:31:25 +01:00
parent 53e4b5c49d
commit adabc100e6
10 changed files with 157 additions and 136 deletions


@@ -1,2 +1,15 @@
PPLX_API_KEY="your perplexity ai key"
MODEL="pplx-7b-chat"
NEWS="world-news"
POSSIBLE_NEWS_VALUES= '
"world-news",
"us-news",
"politics",
"sports",
"entertainment",
"business",
"science",
"ap-fact-check",
"oddities",
"health"
'
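As a quick illustration (not part of this commit), the multi-line POSSIBLE_NEWS_VALUES string can be parsed back into clean category slugs roughly the way main.py does it:

# Sketch only: turn POSSIBLE_NEWS_VALUES into a clean list of category slugs.
import os
from dotenv import load_dotenv

load_dotenv()
raw = os.getenv("POSSIBLE_NEWS_VALUES", "")
categories = [item.strip().strip('"') for item in raw.split(",") if item.strip()]
print(categories)  # e.g. ['world-news', 'us-news', 'politics', ...]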

2 .gitignore vendored

@@ -1,5 +1,5 @@
link
allsides.html
ap.html
test.html
response
models/


@@ -8,12 +8,12 @@ I maintain a daily updated (if I use this software) archive section. This is mad
## Disclaimer
MySides is a personal tool designed to scrape news from AllSides. Please note that all material downloaded, used, and reworked by this software is the property of AllSides. This tool aims to provide a quick overview of daily news. For more information, please refer to the AllSides Terms of Service.
MySides is a personal tool designed to scrape news from APNews. Please note that all material downloaded, used, and reworked by this software is the property of APNews. This tool aims to provide a quick overview of daily news. For more information, please refer to the APNews Terms of Service.
## TLDR
MySides scrapes the latest news from AllSides and uses Perplexity AI APIs to summarize them into a concise, single-page format.
MySides scrapes the latest news from APNews and uses Perplexity AI APIs to summarize them into a concise, single-page format.
## Perplexity AI?
@@ -31,4 +31,4 @@ In my experience, Perplexity AI offers more competitive API pricing than OpenAI.
## Read
Check out allsides.html for the latest summary. The reports are saved into archive/ by default.
Check out ap.html for the latest summary. The reports are saved into archive/ by default.

Binary file not shown.

Binary file not shown.

22 apnews.py Normal file

@@ -0,0 +1,22 @@
def fetchAndDigest(soup):
    news_items = soup.find_all("div", class_="PagePromo")
    print("[+] Filtering out invalid articles...")
    links = []
    for news_item in news_items:
        article_title = news_item['data-gtm-region']
        # Extract the article link and title
        try:
            article_link = news_item.find_all("div", class_="PagePromo-media").pop().find("a").get("href")
        except Exception:
            try:
                article_link = news_item.find_all("h3", class_="PagePromo-title").pop().find("a").get("href")
            except Exception:
                print("[!] Invalid article. Skipping...")
                print(news_item)
                continue
        links.append([article_title, article_link])
    print("[+] Total news: " + str(len(links)))
    return links
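For context, a minimal usage sketch of this module (the hub URL and category are assumptions; fetchAndDigest() only needs a parsed BeautifulSoup document):

# Hypothetical usage of apnews.fetchAndDigest(); the hub URL is an assumption.
import requests
from bs4 import BeautifulSoup

import apnews

response = requests.get("https://apnews.com/world-news", timeout=5)
soup = BeautifulSoup(response.text, "html.parser")
links = apnews.fetchAndDigest(soup)  # list of [title, link] pairs
for title, link in links:
    print(title, "->", link)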

File diff suppressed because one or more lines are too long


@@ -8,4 +8,4 @@ CLEAN=${CLEAN// /_}
CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
# finally, lowercase with TR
CLEAN=`echo -n $CLEAN | tr A-Z a-z`
cp allsides.html archive/$CLEAN.html
cp ap.html archive/$CLEAN.html

149 main.py

@@ -1,68 +1,80 @@
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Our modules
import apnews
import summarizer
load_dotenv()
# Loading environment variables
news_type = os.getenv("NEWS")
pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")
# Main menu
def menu():
    global news_type
    available_news = os.getenv("POSSIBLE_NEWS_VALUES")
    available_news = available_news.split(",")
    print("[ Welcome to MySides ]")
    print("[+] Available news: ")
    counter = 0
    for avail in available_news:
        counter += 1
        print(str(counter) + ") " + avail.strip().replace('"', ""))
    print("[+] Current news: " + news_type)
    print("[+] Press enter to continue or type a number to change the news type.")
    news_type_n = input().strip()
    if news_type_n == "":
        return
    try:
        news_type_n = int(news_type_n)
    except Exception:
        print("[!] Invalid news type.")
        menu()
        return
    news_type_n -= 1
    try:
        # Strip whitespace and quotes so the value matches the NEWS slug format
        news_type = available_news[news_type_n].strip().replace('"', "")
    except Exception:
        print("[!] Invalid news type.")
        menu()
# Fetch and summarize the article
def transform_links(links):
    datas = []
    counter = 0
    print("[+] Extracting data from articles...")
    for link in links:
        counter += 1
        print("[+] Article " + str(counter) + " of " + str(len(links)))
        article_title = link[0]
        article_link = link[1]
        print("[ " + article_title + " ]")
        print("[+] Extracting data from: " + article_link)
        try:
            article_summary = summarizer.summarize(article_link, pplx_api_key, model)
        except Exception as e:
            print(e)
            print("[!] Invalid article. Skipping...")
            continue
        datas.append(
            {
                "article_title": article_title,
                "article_link": article_link,
                "article_summary": article_summary,
            })
    return datas
# Downloads the site and extracts the data using the appropriate module
def extract_data(url):
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")
    news_items = soup.find_all("div", class_="news-item")
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all("a")[0].get("href")
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find("div", class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article
        with open("link", "w+") as f:
            f.write(article_link)
        # trunk-ignore(bandit/B605)
        # trunk-ignore(bandit/B607)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find("span").text
        except Exception:
            source_name = "Unknown"
        try:
            media_bias_rating = (
                news_item.find("img")
                .get("alt")
                .replace("AllSides Media Bias Rating: ", "")
                .lower()
            )
        except Exception:
            media_bias_rating = "Unknown"
        # Build the JSON
        data = {
            "article_link": article_link,
            "article_title": article_title,
            "article_summary": article_summary,
            "source_name": source_name,
            "media_bias_rating": media_bias_rating,
        }
        datas.append(data)
    return datas
    links = apnews.fetchAndDigest(soup)
    transform_links(links)
def handle_pagination(soup):
    next_page = soup.find("a", {"rel": "next"})
@@ -72,7 +84,8 @@ def handle_pagination(soup):
def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    global news_type
    url = "https://apnews.com/" + news_type
    all_data = []
    while url:
@@ -102,37 +115,25 @@ def main():
"""
# Create a nice HTML view of all the articles each one in its own page
html = "<html><head><title>AllSides Unbiased News</title>"
html = "<html><head><title>APNews Unbiased News</title>"
html += "<style>" + css + "</style>"
html += "</head><body>"
for item in all_data:
html += "<h1>" + item["article_title"] + "</h1>"
html += "<h2>" + item["source_name"] + "</h2>"
html += "<h3>" + item["media_bias_rating"] + "</h3>"
html += "<p>" + item["article_summary"] + "</p>"
html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
html += "<hr>"
html += "</body></html>"
with open("allsides.html", "w+") as f:
with open("ap.html", "w+") as f:
f.write(html)
# Archiving (skip if causes errors)
os.system("./archiver.sh")
print("Total articles: ", len(all_data))
# Do some math to find the number of articles per bias rating
bias_ratings = {}
for item in all_data:
if item["media_bias_rating"] in bias_ratings:
bias_ratings[item["media_bias_rating"]] += 1
else:
bias_ratings[item["media_bias_rating"]] = 1
# Assign percentages
for key in bias_ratings:
bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
print(bias_ratings)
if __name__ == "__main__":
    menu()
    print("[+] News type: " + news_type)
    main()


@@ -1,46 +1,36 @@
import os
import requests
from dotenv import load_dotenv
load_dotenv()
def summarize(link, pplx_api_key, model):
    headers = {
        "accept": "application/json",
        "authorization": "Bearer " + pplx_api_key,
        "content-type": "application/json",
    }
pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")
    json_data = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Be precise, concise and clear. Also proofread what you write and make sure not to hallucinate.",
            },
            {
                "role": "user",
                "content": "Read and summarize: " + link,
            },
        ],
    }
with open("link", "r") as f:
    article_link = f.read().strip()
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers=headers,
        json=json_data,
        timeout=5,
    )
headers = {
    "accept": "application/json",
    "authorization": "Bearer " + pplx_api_key,
    "content-type": "application/json",
}
json_data = {
    "model": model,
    "messages": [
        {
            "role": "system",
            "content": "Be precise, concise and clear",
        },
        {
            "role": "user",
            "content": "Search and summarize: " + article_link,
        },
    ],
}
response = requests.post(
    "https://api.perplexity.ai/chat/completions",
    headers=headers,
    json=json_data,
    timeout=5,
)
response = response.json()
# print(response)
# print(response["choices"][0]["message"]["content"])
with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])
    response = response.json()
    # print(response)
    try:
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return "Error: " + str(e)