From b6abb2a75e7291722b67f6bcddb2148b226cf3a4 Mon Sep 17 00:00:00 2001
From: thecookingsenpai
Date: Fri, 12 Jan 2024 20:34:07 +0100
Subject: [PATCH] first commit

---
 .env.example     |   2 +
 .gitignore       |   7 ++
 README.md        |  28 +++++++
 install.sh       |   8 +++
 main.py          | 132 ++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   3 +
 summarizer.py    |  44 ++++++++++++++
 7 files changed, 224 insertions(+)
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 install.sh
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 summarizer.py

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..63e7bc2
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,2 @@
+PPLX_API_KEY="your perplexity ai key"
+MODEL="pplx-7b-chat"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e98d8f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+link
+allsides.html
+test.html
+response
+models/
+news/
+.env
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cd57fce
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# MySides
+
+Your trustworthy unbiased news scraper.
+
+## Disclaimer
+
+This tool is made for personal use and should be used carefully. Since it scrapes [AllSides](https://allsides.com), all the material downloaded, used, and reworked by this software is the property of AllSides.
+
+This tool is intended to help you quickly grasp an overview of the daily news.
+Please check the AllSides Terms of Service for more information.
+
+## TLDR
+
+MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them on a single, clean page.
+
+## Perplexity AI?
+
+Personally, I find their API pricing much better than OpenAI's. If you are a premium user, you also get $5 of API credits per month, which is more than enough to run this program daily.
+
+## Install
+
+    git clone https://github.com/tcsenpai/mysides
+    cd mysides
+    chmod +x install.sh && ./install.sh
+
+## Run
+
+    python main.py
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..e180c9f
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+pip install -r requirements.txt
+mkdir -p news
+cp .env.example .env
+echo "You should now open your .env file and insert your Perplexity API key."
+echo "You can get one at: https://www.perplexity.ai/settings/api"
+echo "Then launch main.py and wait for it to finish."
+echo "allsides.html contains an overview of all the news."
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..3afb53d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,132 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+
+def extract_data(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    news_items = soup.find_all('div', class_='news-item')
+    datas = []
+    tot_articles = len(news_items)
+    print("[+] Total news: " + str(tot_articles))
+    print("[+] Filtering out invalid articles...")
+    counter = 0
+    for news_item in news_items:
+        # Extract the article link and title
+        article_link = news_item.find_all('a')[0].get('href')
+        if "allsides.com" not in article_link:
+            tot_articles -= 1
+            continue
+        counter += 1
+        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
+        article_title = news_item.find('div', class_='news-title').text.strip()
+        print("[*] Summarizing: " + article_link)
+        # Summarize the article: summarizer.py reads the URL from ./link
+        # and writes the summary to ./response
+        with open("link", "w+") as f:
+            f.write(article_link)
+        os.system("python summarizer.py")
+        print("[OK] Done. Proceeding...")
+        with open("response", "r") as f:
+            article_summary = f.read().strip()
+        # Extract the source and media bias rating
+        try:
+            source_name = news_item.find('span').text
+        except AttributeError:
+            source_name = "Unknown"
+
+        try:
+            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
+        except AttributeError:
+            media_bias_rating = "Unknown"
+
+        # Build the article record
+        data = {
+            'article_link': article_link,
+            'article_title': article_title,
+            'article_summary': article_summary,
+            'source_name': source_name,
+            'media_bias_rating': media_bias_rating
+        }
+
+        datas.append(data)
+
+    return datas
+
+def handle_pagination(soup):
+    next_page = soup.find('a', {'rel': 'next'})
+    if next_page:
+        return next_page['href']
+    return None
+
+def main():
+    url = "https://www.allsides.com/unbiased-balanced-news"
+    all_data = []
+
+    while url:
+        data = extract_data(url)
+        all_data.extend(data)
+        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+
+    # Prepare a nice CSS for the viewing page (nice and clean)
+    css = """
+    body {
+        font-family: Helvetica, Arial, sans-serif;
+    }
+    h1 {
+        font-size: 2em;
+    }
+    h2 {
+        font-size: 1.5em;
+    }
+    h3 {
+        font-size: 1.2em;
+    }
+    p {
+        font-size: 1em;
+    }
+    """
+
+    # Create an HTML view of all the articles, each one in its own block
+    html = "<html><head><title>AllSides Unbiased News</title>"
+    html += "<style>" + css + "</style>"
+    html += "</head><body>"
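+    # Each entry in all_data has the shape built by extract_data, e.g.
+    # (illustrative values only):
+    #   {'article_link': 'https://www.allsides.com/story/...',
+    #    'article_title': '...', 'article_summary': '...',
+    #    'source_name': '...', 'media_bias_rating': 'center'}
+    # The loop below renders each entry as one HTML block:
+    #   <div><h1>title</h1><h2>source</h2><h3>bias</h3><p>summary</p>
+    #   <a href='link'>Read the full article</a></div>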
+    for item in all_data:
+        html += "<div>"
+        html += "<h1>" + item['article_title'] + "</h1>"
+        html += "<h2>" + item['source_name'] + "</h2>"
+        html += "<h3>" + item['media_bias_rating'] + "</h3>"
+        html += "<p>" + item['article_summary'] + "</p>"
+        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "</div>"
+    html += "</body></html>"
+    with open("allsides.html", "w+") as f:
+        f.write(html)
+
+    print("Total articles: ", len(all_data))
" + html += "" + with open("allsides.html", "w+") as f: + f.write(html) + + print("Total articles: ", len(all_data)) + # Do some math to find the number of articles per bias rating + bias_ratings = {} + for item in all_data: + if item['media_bias_rating'] in bias_ratings: + bias_ratings[item['media_bias_rating']] += 1 + else: + bias_ratings[item['media_bias_rating']] = 1 + # Assign percentages + for key in bias_ratings: + bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2) + + print(bias_ratings) + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..efd35dc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +bs4 +requests +python-dotenv \ No newline at end of file diff --git a/summarizer.py b/summarizer.py new file mode 100644 index 0000000..d6c32f0 --- /dev/null +++ b/summarizer.py @@ -0,0 +1,41 @@ +import requests +from dotenv import load_dotenv +import os + +load_dotenv() + +pplx_api_key = os.getenv("PPLX_API_KEY") +model = os.getenv("MODEL") + +with open("link", "r") as f: + article_link = f.read().strip() + + +headers = { + 'accept': 'application/json', + 'authorization': 'Bearer ' + pplx_api_key, + 'content-type': 'application/json', +} + +json_data = { + 'model': model, + 'messages': [ + { + 'role': 'system', + 'content': 'Be precise, concise and clear', + }, + { + 'role': 'user', + 'content': 'Search and summarize: ' + article_link, + }, + ], +} + +response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data) + +response = response.json() +#print(response) + +#print(response["choices"][0]["message"]["content"]) +with open("response", "w+") as response_file: + response_file.write(response["choices"][0]["message"]["content"]) \ No newline at end of file