formatted and better coded

thecookingsenpai 2024-01-12 20:54:35 +01:00
parent b6abb2a75e
commit 4fe0d8e61d
10 changed files with 136 additions and 52 deletions

.trunk/.gitignore (vendored, new file)

@@ -0,0 +1,9 @@
*out
*logs
*actions
*notifications
*tools
plugins
user_trunk.yaml
user.yaml
tmp

.trunk/configs/.isort.cfg (new file)

@@ -0,0 +1,2 @@
[settings]
profile=black

.trunk/configs/.markdownlint.yaml (new file)

@@ -0,0 +1,10 @@
# Autoformatter friendly markdownlint config (all formatting rules disabled)
default: true
blank_lines: false
bullet: false
html: false
indentation: false
line_length: false
spaces: false
url: false
whitespace: false

.trunk/configs/.shellcheckrc (new file)

@@ -0,0 +1,7 @@
enable=all
source-path=SCRIPTDIR
disable=SC2154
# If you're having issues with shellcheck following source, disable the errors via:
# disable=SC1090
# disable=SC1091

.trunk/configs/ruff.toml (new file)

@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]
# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]

.trunk/trunk.yaml (new file)

@@ -0,0 +1,31 @@
# This file controls the behavior of Trunk: https://docs.trunk.io/cli
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
  version: 1.19.0
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
  sources:
    - id: trunk
      ref: v1.4.2
      uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
  enabled:
    - go@1.21.0
    - node@18.12.1
    - python@3.10.8
# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
lint:
  enabled:
    - bandit@1.7.6
    - black@23.12.1
    - git-diff-check
    - isort@5.13.2
    - markdownlint@0.38.0
    - osv-scanner@1.5.0
    - prettier@3.1.1
    - ruff@0.1.12
    - shellcheck@0.9.0
    - shfmt@3.6.0
    - trufflehog@3.63.8

README.md

@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.
## Disclaimer
This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.
This tool is intended to be used to quickly grasp an overview of the daily news.
Please check AllSides ToS for more information.
@@ -25,4 +25,4 @@ Personally, I find their API pricing way better than OpenAI ones. If you are a p
## Run
python main.py
python main.py

install.sh

@@ -1,8 +1,9 @@
#!/bin/bash
pip install -r requirements.txt
mkdir news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API Key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then, launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."
echo "allsides.html contains an overview of all the news."

main.py

@@ -1,11 +1,13 @@
import requests
from bs4 import BeautifulSoup
import os
import requests
from bs4 import BeautifulSoup

def extract_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")
    news_items = soup.find_all("div", class_="news-item")
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
@@ -13,53 +15,62 @@ def extract_data(url):
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if not "allsides.com" in article_link:
        article_link = news_item.find_all("a")[0].get("href")
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles) )
        article_title = news_item.find('div', class_="news-title").text.strip()
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find("div", class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article
        with open("link", "w+") as f:
            f.write(article_link)
        # trunk-ignore(bandit/B605)
        # trunk-ignore(bandit/B607)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        #with open(article_title, "w+") as f:
        #f.write(article_summary)
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except:
            source_name = news_item.find("span").text
        except Exception:
            source_name = "Unknown"
        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except:
        try:
            media_bias_rating = (
                news_item.find("img")
                .get("alt")
                .replace("AllSides Media Bias Rating: ", "")
                .lower()
            )
        except Exception:
            media_bias_rating = "Unknown"
        # Build the JSON
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
            "article_link": article_link,
            "article_title": article_title,
            "article_summary": article_summary,
            "source_name": source_name,
            "media_bias_rating": media_bias_rating,
        }
        datas.append(data)
    return datas

def handle_pagination(soup):
    next_page = soup.find('a', {'rel': 'next'})
    next_page = soup.find("a", {"rel": "next"})
    if next_page:
        return next_page['href']
        return next_page["href"]
    return None

def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []
@@ -67,7 +78,9 @@ def main():
    while url:
        data = extract_data(url)
        all_data.extend(data)
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
        url = handle_pagination(
            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
        )

    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
@@ -93,11 +106,11 @@ def main():
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<h1>" + item["article_title"] + "</h1>"
        html += "<h2>" + item["source_name"] + "</h2>"
        html += "<h3>" + item["media_bias_rating"] + "</h3>"
        html += "<p>" + item["article_summary"] + "</p>"
        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
    # Do some math to find the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        if item["media_bias_rating"] in bias_ratings:
            bias_ratings[item["media_bias_rating"]] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
            bias_ratings[item["media_bias_rating"]] = 1
    # Assign percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
    print(bias_ratings)

if __name__ == "__main__":
    main()
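
The two trunk-ignore comments in extract_data() silence bandit's B605/B607 findings about launching summarizer.py through os.system. As an illustration of what those warnings point at — not something this commit does — the shell could be avoided with subprocess:

```python
import subprocess
import sys

# Hypothetical alternative to os.system("python summarizer.py"):
# passing a list skips the shell (the point of bandit B605/B607), and
# check=True raises CalledProcessError if summarizer.py exits non-zero.
subprocess.run([sys.executable, "summarizer.py"], check=True)
```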

summarizer.py

@@ -1,6 +1,7 @@
import os

import requests
from dotenv import load_dotenv
import os

load_dotenv()
@@ -10,32 +11,36 @@ model = os.getenv("MODEL")
with open("link", "r") as f:
    article_link = f.read().strip()

headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
    "accept": "application/json",
    "authorization": "Bearer " + pplx_api_key,
    "content-type": "application/json",
}

json_data = {
    'model': model,
    'messages': [
    "model": model,
    "messages": [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
            "role": "system",
            "content": "Be precise, concise and clear",
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
            "role": "user",
            "content": "Search and summarize: " + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
response = requests.post(
    "https://api.perplexity.ai/chat/completions",
    headers=headers,
    json=json_data,
    timeout=5,
)
response = response.json()

#print(response)
# print(response)
#print(response["choices"][0]["message"]["content"])
# print(response["choices"][0]["message"]["content"])

with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])
    response_file.write(response["choices"][0]["message"]["content"])
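
The final write assumes a well-formed completion: if the API returns an error payload there is no "choices" key and the script raises KeyError. A defensive variant — a sketch under that assumption, not part of this commit — might look like:

```python
# Illustrative only: guard against an error payload before indexing.
try:
    content = response["choices"][0]["message"]["content"]
except (KeyError, IndexError, TypeError):
    raise SystemExit("[!] Unexpected API response: " + str(response))

with open("response", "w+") as response_file:
    response_file.write(content)
```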