mirror of https://github.com/tcsenpai/mysides.git (synced 2025-06-06 02:55:21 +00:00)
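"""Scrape the AllSides "Balanced News" page, summarize each linked article by
shelling out to the companion summarizer.py script, and write an HTML digest
(allsides.html) plus a per-bias-rating percentage breakdown to stdout."""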
import os

import requests
from bs4 import BeautifulSoup
def extract_data(url):
    """Scrape one AllSides news page and return a list of article records."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        # Skip items that do not link back to allsides.com
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article: summarizer.py reads the URL from the "link"
        # file and writes its summary to the "response" file.
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
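        # A more robust alternative (assumes nothing beyond the stdlib) would
        # be subprocess.run([sys.executable, "summarizer.py"], check=True),
        # which avoids relying on a "python" executable being on PATH and
        # surfaces failures; it would need "import subprocess, sys" at the top.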
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # (Disabled) optionally dump each summary to its own file:
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)

        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except AttributeError:
            source_name = "Unknown"

        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except AttributeError:
            media_bias_rating = "Unknown"

        # Build the record for this article
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }
        datas.append(data)

    return datas


def handle_pagination(soup):
    """Return the href of the rel="next" pagination link, or None if absent."""
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None
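# Note: handle_pagination returns the href attribute verbatim. If AllSides
# serves a relative link here (e.g. "?page=2"), the caller would need to
# resolve it first, for instance with
# urllib.parse.urljoin("https://www.allsides.com/", href).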


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []

    while url:
        data = extract_data(url)
        all_data.extend(data)
        # Each iteration fetches the page twice: once inside extract_data and
        # once here to look for a "next" link.
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))

    # Prepare a simple, clean stylesheet for the viewing page
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """

    # Build a single HTML page with one section per article
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
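    # Titles and summaries are inserted verbatim; if they can contain markup
    # characters, escaping them first (e.g. with the stdlib html.escape, which
    # here would need importing under an alias since the local variable
    # "html" shadows the module name) would keep the page well-formed.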
    with open("allsides.html", "w+") as f:
        f.write(html)

    print("Total articles: ", len(all_data))
    # Count the articles per media bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Convert the counts to percentages of the total
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
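    # bias_ratings now maps each rating to its share of the articles, e.g.
    # (hypothetical values): {'left': 25.0, 'center': 50.0, 'right': 25.0}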
    print(bias_ratings)


if __name__ == "__main__":
    main()
|