mirror of https://github.com/tcsenpai/mysides.git (synced 2025-06-06 02:55:21 +00:00)
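"""Scrape the AllSides "Balanced News" page, summarize each linked article by
shelling out to the companion summarizer.py script, and write an HTML digest
(allsides.html) plus a per-bias-rating percentage breakdown to stdout."""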
import os

import requests
from bs4 import BeautifulSoup
def extract_data(url):
    """Scrape one AllSides news page and return a list of article records."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        # Skip items that do not link back to allsides.com
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article: summarizer.py reads the URL from the "link"
        # file and writes its summary to the "response" file.
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
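        # A more robust alternative (assumes nothing beyond the stdlib) would
        # be subprocess.run([sys.executable, "summarizer.py"], check=True),
        # which avoids relying on a "python" executable being on PATH and
        # surfaces failures; it would need "import subprocess, sys" at the top.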
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # (Disabled) optionally dump each summary to its own file:
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)

        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except AttributeError:
            source_name = "Unknown"

        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except AttributeError:
            media_bias_rating = "Unknown"

        # Build the record for this article
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }
        datas.append(data)

    return datas


def handle_pagination(soup):
    """Return the href of the rel="next" pagination link, or None if absent."""
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None
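# Note: handle_pagination returns the href attribute verbatim. If AllSides
# serves a relative link here (e.g. "?page=2"), the caller would need to
# resolve it first, for instance with
# urllib.parse.urljoin("https://www.allsides.com/", href).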


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []

    while url:
        data = extract_data(url)
        all_data.extend(data)
        # Each iteration fetches the page twice: once inside extract_data and
        # once here to look for a "next" link.
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))

    # Prepare a simple, clean stylesheet for the viewing page
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """

    # Build a single HTML page with one section per article
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
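    # Titles and summaries are inserted verbatim; if they can contain markup
    # characters, escaping them first (e.g. with the stdlib html.escape, which
    # here would need importing under an alias since the local variable
    # "html" shadows the module name) would keep the page well-formed.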
    with open("allsides.html", "w+") as f:
        f.write(html)

    print("Total articles: ", len(all_data))
    # Count the articles per media bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Convert the counts to percentages of the total
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
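    # bias_ratings now maps each rating to its share of the articles, e.g.
    # (hypothetical values): {'left': 25.0, 'center': 50.0, 'right': 25.0}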
    print(bias_ratings)


if __name__ == "__main__":
    main()
|