formatted and better coded

thecookingsenpai 2024-01-12 20:54:35 +01:00
parent b6abb2a75e
commit 4fe0d8e61d
10 changed files with 136 additions and 52 deletions

.trunk/.gitignore (new vendored file)

@@ -0,0 +1,9 @@
*out
*logs
*actions
*notifications
*tools
plugins
user_trunk.yaml
user.yaml
tmp

.trunk/configs/.isort.cfg (new file)

@@ -0,0 +1,2 @@
[settings]
profile=black

.trunk/configs/.markdownlint.yaml (new file)

@@ -0,0 +1,10 @@
# Autoformatter friendly markdownlint config (all formatting rules disabled)
default: true
blank_lines: false
bullet: false
html: false
indentation: false
line_length: false
spaces: false
url: false
whitespace: false

.trunk/configs/.shellcheckrc (new file)

@@ -0,0 +1,7 @@
enable=all
source-path=SCRIPTDIR
disable=SC2154
# If you're having issues with shellcheck following source, disable the errors via:
# disable=SC1090
# disable=SC1091

.trunk/configs/ruff.toml (new file)

@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]
# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]

.trunk/trunk.yaml (new file)

@@ -0,0 +1,31 @@
# This file controls the behavior of Trunk: https://docs.trunk.io/cli
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
  version: 1.19.0
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
  sources:
    - id: trunk
      ref: v1.4.2
      uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
  enabled:
    - go@1.21.0
    - node@18.12.1
    - python@3.10.8
# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
lint:
  enabled:
    - bandit@1.7.6
    - black@23.12.1
    - git-diff-check
    - isort@5.13.2
    - markdownlint@0.38.0
    - osv-scanner@1.5.0
    - prettier@3.1.1
    - ruff@0.1.12
    - shellcheck@0.9.0
    - shfmt@3.6.0
    - trufflehog@3.63.8
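
With this file in place, the enabled linters are normally invoked through the Trunk CLI (trunk check). A minimal sketch of driving that from Python, assuming the trunk launcher is installed and on PATH:

import subprocess

# Run the linters enabled in .trunk/trunk.yaml over the working tree.
# `trunk check` is the Trunk CLI's standard entry point (https://docs.trunk.io/cli).
result = subprocess.run(["trunk", "check"], check=False)
print("trunk check exited with code", result.returncode)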

README.md

@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.
 ## Disclaimer

-This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
+This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.
 This tool is intended to be used to quickly grasp an overview of the daily news.
 Please check AllSides ToS for more information.

install.sh

@@ -1,4 +1,5 @@
 #!/bin/bash
 pip install -r requirements.txt
 mkdir news
 cp .env.example .env

main.py

@@ -1,11 +1,13 @@
-import requests
-from bs4 import BeautifulSoup
 import os
+
+import requests
+from bs4 import BeautifulSoup
+

 def extract_data(url):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    news_items = soup.find_all('div', class_='news-item')
+    response = requests.get(url, timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    news_items = soup.find_all("div", class_="news-item")
     datas = []
     tot_articles = len(news_items)
     print("[+] Total news: " + str(tot_articles))
@@ -13,17 +15,19 @@ def extract_data(url):
     counter = 0
     for news_item in news_items:
         # Extract the article link and title
-        article_link = news_item.find_all('a')[0].get('href')
-        if not "allsides.com" in article_link:
+        article_link = news_item.find_all("a")[0].get("href")
+        if "allsides.com" not in article_link:
             tot_articles -= 1
             continue
         counter += 1
         print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
-        article_title = news_item.find('div', class_="news-title").text.strip()
+        article_title = news_item.find("div", class_="news-title").text.strip()
         print("[*] Summarizing: " + article_link)
         # Summarize the article
         with open("link", "w+") as f:
             f.write(article_link)
+        # trunk-ignore(bandit/B605)
+        # trunk-ignore(bandit/B607)
         os.system("python summarizer.py")
         print("[OK] Done. Proceeding...")
         with open("response", "r") as f:
@@ -32,34 +36,41 @@ def extract_data(url):
         # f.write(article_summary)
         # Extract the source and media bias rating
         try:
-            source_name = news_item.find('span').text
-        except:
+            source_name = news_item.find("span").text
+        except Exception:
             source_name = "Unknown"
         try:
-            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
-        except:
+            media_bias_rating = (
+                news_item.find("img")
+                .get("alt")
+                .replace("AllSides Media Bias Rating: ", "")
+                .lower()
+            )
+        except Exception:
             media_bias_rating = "Unknown"
         # Build the JSON
         data = {
-            'article_link': article_link,
-            'article_title': article_title,
-            'article_summary': article_summary,
-            'source_name': source_name,
-            'media_bias_rating': media_bias_rating
+            "article_link": article_link,
+            "article_title": article_title,
+            "article_summary": article_summary,
+            "source_name": source_name,
+            "media_bias_rating": media_bias_rating,
         }
         datas.append(data)
     return datas

+
 def handle_pagination(soup):
-    next_page = soup.find('a', {'rel': 'next'})
+    next_page = soup.find("a", {"rel": "next"})
     if next_page:
-        return next_page['href']
+        return next_page["href"]
     return None

+
 def main():
     url = "https://www.allsides.com/unbiased-balanced-news"
     all_data = []
@@ -67,7 +78,9 @@ def main():
     while url:
         data = extract_data(url)
         all_data.extend(data)
-        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+        url = handle_pagination(
+            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
+        )

     # Prepare a nice CSS for the viewing page (nice and clean)
     css = """
@@ -93,11 +106,11 @@ def main():
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
-        html += "<h1>" + item['article_title'] + "</h1>"
-        html += "<h2>" + item['source_name'] + "</h2>"
-        html += "<h3>" + item['media_bias_rating'] + "</h3>"
-        html += "<p>" + item['article_summary'] + "</p>"
-        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "<h1>" + item["article_title"] + "</h1>"
+        html += "<h2>" + item["source_name"] + "</h2>"
+        html += "<h3>" + item["media_bias_rating"] + "</h3>"
+        html += "<p>" + item["article_summary"] + "</p>"
+        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
     with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
     # Do some math to find the number of articles per bias rating
     bias_ratings = {}
     for item in all_data:
-        if item['media_bias_rating'] in bias_ratings:
-            bias_ratings[item['media_bias_rating']] += 1
+        if item["media_bias_rating"] in bias_ratings:
+            bias_ratings[item["media_bias_rating"]] += 1
         else:
-            bias_ratings[item['media_bias_rating']] = 1
+            bias_ratings[item["media_bias_rating"]] = 1
     # Assign percentages
     for key in bias_ratings:
         bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
     print(bias_ratings)

+
 if __name__ == "__main__":
     main()
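
The two trunk-ignore comments added above suppress Bandit's B605 (starting a process with a shell) and B607 (starting a process with a partial executable path) findings on the os.system call. A shell-free alternative, sketched here rather than taken from the commit, would invoke the interpreter through subprocess with an explicit argument list:

import subprocess
import sys

# sys.executable is an absolute path and the argument list bypasses the shell,
# so neither B605 nor B607 applies; check=True surfaces a non-zero exit status.
subprocess.run([sys.executable, "summarizer.py"], check=True)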

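The bias-rating arithmetic at the end of main() counts articles per rating and turns the counts into percentages of the total. An equivalent sketch with collections.Counter, using made-up sample data:

from collections import Counter

# Hypothetical sample: 4 left, 3 center, 3 right out of 10 articles.
ratings = ["left"] * 4 + ["center"] * 3 + ["right"] * 3
percentages = {
    rating: round(count / len(ratings) * 100, 2)
    for rating, count in Counter(ratings).items()
}
print(percentages)  # {'left': 40.0, 'center': 30.0, 'right': 30.0}
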
summarizer.py

@@ -1,6 +1,7 @@
+import os
+
 import requests
 from dotenv import load_dotenv
-import os

 load_dotenv()
@@ -10,28 +11,32 @@ model = os.getenv("MODEL")
 with open("link", "r") as f:
     article_link = f.read().strip()

 headers = {
-    'accept': 'application/json',
-    'authorization': 'Bearer ' + pplx_api_key,
-    'content-type': 'application/json',
+    "accept": "application/json",
+    "authorization": "Bearer " + pplx_api_key,
+    "content-type": "application/json",
 }

 json_data = {
-    'model': model,
-    'messages': [
+    "model": model,
+    "messages": [
         {
-            'role': 'system',
-            'content': 'Be precise, concise and clear',
+            "role": "system",
+            "content": "Be precise, concise and clear",
         },
         {
-            'role': 'user',
-            'content': 'Search and summarize: ' + article_link,
+            "role": "user",
+            "content": "Search and summarize: " + article_link,
         },
     ],
 }

-response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
+response = requests.post(
+    "https://api.perplexity.ai/chat/completions",
+    headers=headers,
+    json=json_data,
+    timeout=5,
+)
 response = response.json()
 # print(response)
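
main.py reads the finished summary back from a file named response once summarizer.py exits. The tail of summarizer.py is not shown in this diff; assuming Perplexity's OpenAI-style chat-completion payload, the extraction step would look roughly like this sketch (the choices[0]["message"]["content"] path is an assumption, not code from the commit):

# Continuation sketch: `response` is the parsed JSON from the request above.
summary = response["choices"][0]["message"]["content"]

with open("response", "w+") as f:
    f.write(summary)  # main.py reads this file after summarizer.py finishes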