mirror of https://github.com/tcsenpai/mysides.git
synced 2025-06-04 10:10:05 +00:00

first commit
This commit is contained in: b6abb2a75e
2 .env.example Normal file
@@ -0,0 +1,2 @@
PPLX_API_KEY="your perplexity ai key"
MODEL="pplx-7b-chat"
7 .gitignore vendored Normal file
@@ -0,0 +1,7 @@
link
allsides.html
test.html
response
models/
news/
.env
28 README.md Normal file
@@ -0,0 +1,28 @@
# MySides

Your trustworthy unbiased news scraper.

## Disclaimer

This tool is made for personal use and should be used carefully. Since it is a scraper for [AllSides](https://allsides.com), all the material downloaded, used, and reworked by this software is the property of AllSides.

This tool is intended to help you quickly get an overview of the daily news.

Please check the AllSides ToS for more information.

## TLDR

MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them in a single, clean page.
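
In short, `main.py` collects the article links from the AllSides front page and, for each one, hands the link to `summarizer.py` through a small file-based handoff: it writes the URL to a `link` file, runs the summarizer, and reads the summary back from a `response` file before assembling everything into `allsides.html`. A minimal sketch of that per-article step, using the same file names the repo uses:

```python
import os

def summarize(article_link: str) -> str:
    # Pass the article URL to summarizer.py through the "link" file...
    with open("link", "w") as f:
        f.write(article_link)
    os.system("python summarizer.py")
    # ...and read the Perplexity summary back from the "response" file.
    with open("response", "r") as f:
        return f.read().strip()
```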

## Perplexity AI?

Personally, I find their API pricing much better than OpenAI's. If you are a premium user, you also get $5 per month of API credits, which is more than enough to run this program daily.

## Install

git clone https://github.com/tcsenpai/mysides

cd mysides

chmod +x install.sh && ./install.sh

## Run

python main.py
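
The output is written to allsides.html in the project directory; open it in a browser to read the summaries and bias ratings.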
8 install.sh Normal file
@@ -0,0 +1,8 @@
#!/bin/bash
pip install -r requirements.txt
mkdir news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API Key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then, launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."
121 main.py Normal file
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import os


def extract_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article: hand the link to summarizer.py through the
        # "link" file and read the summary back from the "response" file.
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)

        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except AttributeError:
            source_name = "Unknown"

        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except AttributeError:
            media_bias_rating = "Unknown"

        # Build the JSON
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }

        datas.append(data)

    return datas


def handle_pagination(soup):
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []

    while url:
        data = extract_data(url)
        all_data.extend(data)
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))

    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """

    # Create a nice HTML view of all the articles, each one in its own section
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
        f.write(html)

    print("Total articles: ", len(all_data))
    # Do some math to find the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Assign percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

    print(bias_ratings)


if __name__ == "__main__":
    main()
3 requirements.txt Normal file
@@ -0,0 +1,3 @@
bs4
requests
python-dotenv
41 summarizer.py Normal file
@@ -0,0 +1,41 @@
import requests
from dotenv import load_dotenv
import os

load_dotenv()

pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")

# main.py writes the article URL to the "link" file before invoking this script.
with open("link", "r") as f:
    article_link = f.read().strip()


headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
}

json_data = {
    'model': model,
    'messages': [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)

response = response.json()
# print(response)

# print(response["choices"][0]["message"]["content"])
with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])