mirror of https://github.com/tcsenpai/mysides.git
synced 2025-06-02 17:20:05 +00:00

formatted and better coded

This commit is contained in:
parent b6abb2a75e
commit 4fe0d8e61d

.trunk/.gitignore (vendored, new file, 9 lines)

@@ -0,0 +1,9 @@
+*out
+*logs
+*actions
+*notifications
+*tools
+plugins
+user_trunk.yaml
+user.yaml
+tmp

.trunk/configs/.isort.cfg (new file, 2 lines)

@@ -0,0 +1,2 @@
+[settings]
+profile=black

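Note: profile=black makes isort's output compatible with black, so the two tools do not fight over import layout. Its effect is visible in the main.py and summarizer.py hunks below: standard-library imports first, a blank line, then third-party imports. A minimal sketch of the reordering, using this repo's own imports:

    # isort --profile black turns
    #   import requests
    #   from bs4 import BeautifulSoup
    #   import os
    # into the grouped form:
    import os

    import requests
    from bs4 import BeautifulSoup
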
.trunk/configs/.markdownlint.yaml (new file, 10 lines)

@@ -0,0 +1,10 @@
+# Autoformatter friendly markdownlint config (all formatting rules disabled)
+default: true
+blank_lines: false
+bullet: false
+html: false
+indentation: false
+line_length: false
+spaces: false
+url: false
+whitespace: false

.trunk/configs/.shellcheckrc (new file, 7 lines)

@@ -0,0 +1,7 @@
+enable=all
+source-path=SCRIPTDIR
+disable=SC2154
+
+# If you're having issues with shellcheck following source, disable the errors via:
+# disable=SC1090
+# disable=SC1091

.trunk/configs/ruff.toml (new file, 5 lines)

@@ -0,0 +1,5 @@
+# Generic, formatter-friendly config.
+select = ["B", "D3", "E", "F"]
+
+# Never enforce `E501` (line length violations). This should be handled by formatters.
+ignore = ["E501"]

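Note: the selected rule families are flake8-bugbear (B), a subset of pydocstyle (D3), pycodestyle errors (E), and pyflakes (F); E501 is ignored because black owns line length. As an illustration, not part of the commit, a snippet that trips two of these families:

    def append_item(value, items=[]):  # B006: mutable default argument
        unused = 42  # F841: local variable assigned but never used
        items.append(value)
        return items
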
.trunk/trunk.yaml (new file, 31 lines)

@@ -0,0 +1,31 @@
+# This file controls the behavior of Trunk: https://docs.trunk.io/cli
+# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
+version: 0.1
+cli:
+  version: 1.19.0
+# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
+plugins:
+  sources:
+    - id: trunk
+      ref: v1.4.2
+      uri: https://github.com/trunk-io/plugins
+# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
+runtimes:
+  enabled:
+    - go@1.21.0
+    - node@18.12.1
+    - python@3.10.8
+# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
+lint:
+  enabled:
+    - bandit@1.7.6
+    - black@23.12.1
+    - git-diff-check
+    - isort@5.13.2
+    - markdownlint@0.38.0
+    - osv-scanner@1.5.0
+    - prettier@3.1.1
+    - ruff@0.1.12
+    - shellcheck@0.9.0
+    - shfmt@3.6.0
+    - trufflehog@3.63.8

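Note: black@23.12.1 drives most of the churn in the Python hunks below: single quotes become double quotes, overlong calls are split one argument per line, and multi-line literals gain trailing commas. A minimal sketch of the quote and trailing-comma rewrites (illustrative data):

    # Pre-black:  {'role': 'system', 'content': 'Be precise'}
    message = {
        "role": "system",
        "content": "Be precise",
    }
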
README.md

@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.

 ## Disclaimer

-This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
+This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.

 This tool is intended to be used to quickly grasp an overview of the daily news.
 Please check AllSides ToS for more information.

@@ -25,4 +25,4 @@ Personally, I find their API pricing way better than OpenAI ones. If you are a p

 ## Run

-python main.py
+python main.py

@@ -1,8 +1,9 @@
 #!/bin/bash
+
 pip install -r requirements.txt
 mkdir news
 cp .env.example .env
 echo "You should now open your .env file and insert your Perplexity API Key."
 echo "You can get one at: https://www.perplexity.ai/settings/api"
 echo "Then, launch main.py and wait for it to finish."
-echo "allsides.html contains an overview of all the news."
+echo "allsides.html contains an overview of all the news."

main.py (82 changes)

@@ -1,11 +1,13 @@
-import requests
-from bs4 import BeautifulSoup
 import os
+
+import requests
+from bs4 import BeautifulSoup
+

 def extract_data(url):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    news_items = soup.find_all('div', class_='news-item')
+    response = requests.get(url, timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    news_items = soup.find_all("div", class_="news-item")
     datas = []
     tot_articles = len(news_items)
     print("[+] Total news: " + str(tot_articles))

@@ -13,53 +15,62 @@ def extract_data(url):
     counter = 0
     for news_item in news_items:
         # Extract the article link and title
-        article_link = news_item.find_all('a')[0].get('href')
-        if not "allsides.com" in article_link:
+        article_link = news_item.find_all("a")[0].get("href")
+        if "allsides.com" not in article_link:
             tot_articles -= 1
             continue
-        counter += 1
-        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles) )
-        article_title = news_item.find('div', class_="news-title").text.strip()
+        counter += 1
+        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
+        article_title = news_item.find("div", class_="news-title").text.strip()
         print("[*] Summarizing: " + article_link)
         # Summarize the article
         with open("link", "w+") as f:
             f.write(article_link)
+        # trunk-ignore(bandit/B605)
+        # trunk-ignore(bandit/B607)
         os.system("python summarizer.py")
         print("[OK] Done. Proceeding...")
         with open("response", "r") as f:
             article_summary = f.read().strip()
-        #with open(article_title, "w+") as f:
-            #f.write(article_summary)
+        # with open(article_title, "w+") as f:
+        # f.write(article_summary)
         # Extract the source and media bias rating
         try:
-            source_name = news_item.find('span').text
-        except:
+            source_name = news_item.find("span").text
+        except Exception:
             source_name = "Unknown"

         try:
-            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
-        except:
+            media_bias_rating = (
+                news_item.find("img")
+                .get("alt")
+                .replace("AllSides Media Bias Rating: ", "")
+                .lower()
+            )
+        except Exception:
             media_bias_rating = "Unknown"

         # Build the JSON
         data = {
-            'article_link': article_link,
-            'article_title': article_title,
-            'article_summary': article_summary,
-            'source_name': source_name,
-            'media_bias_rating': media_bias_rating
+            "article_link": article_link,
+            "article_title": article_title,
+            "article_summary": article_summary,
+            "source_name": source_name,
+            "media_bias_rating": media_bias_rating,
         }

         datas.append(data)

     return datas


 def handle_pagination(soup):
-    next_page = soup.find('a', {'rel': 'next'})
+    next_page = soup.find("a", {"rel": "next"})
     if next_page:
-        return next_page['href']
+        return next_page["href"]
     return None


 def main():
     url = "https://www.allsides.com/unbiased-balanced-news"
     all_data = []

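Note: the added trunk-ignore comments suppress bandit's B605/B607 findings (starting a process through a shell, with a partial executable path) rather than resolving them. A hedged alternative sketch, assuming the same link/response file handoff, that would satisfy bandit without suppressions:

    import subprocess
    import sys

    # Run the summarizer without a shell, via the current interpreter's
    # absolute path; check=True turns a non-zero exit into an exception.
    subprocess.run([sys.executable, "summarizer.py"], check=True)
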
@@ -67,7 +78,9 @@ def main():
     while url:
         data = extract_data(url)
         all_data.extend(data)
-        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+        url = handle_pagination(
+            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
+        )

     # Prepare a nice CSS for the viewing page (nice and clean)
     css = """

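Note: both requests.get calls gain timeout=5 in this commit; without a timeout, a request can block indefinitely on an unresponsive server. The parameter converts that hang into a catchable exception, sketched here:

    import requests

    try:
        response = requests.get(
            "https://www.allsides.com/unbiased-balanced-news", timeout=5
        )
    except requests.exceptions.Timeout:
        # Raised when the server does not respond within 5 seconds,
        # instead of stalling the whole scrape.
        response = None
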
@@ -93,11 +106,11 @@ def main():
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
-        html += "<h1>" + item['article_title'] + "</h1>"
-        html += "<h2>" + item['source_name'] + "</h2>"
-        html += "<h3>" + item['media_bias_rating'] + "</h3>"
-        html += "<p>" + item['article_summary'] + "</p>"
-        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "<h1>" + item["article_title"] + "</h1>"
+        html += "<h2>" + item["source_name"] + "</h2>"
+        html += "<h3>" + item["media_bias_rating"] + "</h3>"
+        html += "<p>" + item["article_summary"] + "</p>"
+        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
     with open("allsides.html", "w+") as f:

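Note: the page is assembled by raw string concatenation, so any <, > or & in a scraped title or summary lands in allsides.html unescaped. Not part of this commit, but a standard-library hardening sketch:

    import html

    title = "Markets <mixed> as Q&A continues"  # illustrative scraped text
    safe = "<h1>" + html.escape(title) + "</h1>"
    print(safe)  # <h1>Markets &lt;mixed&gt; as Q&amp;A continues</h1>
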
@@ -107,15 +120,16 @@ def main():
     # Do some math to find the number of articles per bias rating
     bias_ratings = {}
     for item in all_data:
-        if item['media_bias_rating'] in bias_ratings:
-            bias_ratings[item['media_bias_rating']] += 1
+        if item["media_bias_rating"] in bias_ratings:
+            bias_ratings[item["media_bias_rating"]] += 1
         else:
-            bias_ratings[item['media_bias_rating']] = 1
+            bias_ratings[item["media_bias_rating"]] = 1
     # Assign percentages
     for key in bias_ratings:
         bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

     print(bias_ratings)

+
 if __name__ == "__main__":
     main()

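Note: the closing pass is plain dictionary counting: each rating's count divided by the article total, times 100, rounded to two decimals. The same computation sketched with collections.Counter (illustrative data, not part of the commit):

    from collections import Counter

    ratings = ["left", "center", "left", "right"]
    counts = Counter(ratings)
    percentages = {k: round(v / len(ratings) * 100, 2) for k, v in counts.items()}
    print(percentages)  # {'left': 50.0, 'center': 25.0, 'right': 25.0}
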
summarizer.py

@@ -1,6 +1,7 @@
+import os
+
 import requests
 from dotenv import load_dotenv
-import os

 load_dotenv()

@@ -10,32 +11,36 @@ model = os.getenv("MODEL")
 with open("link", "r") as f:
     article_link = f.read().strip()


 headers = {
-    'accept': 'application/json',
-    'authorization': 'Bearer ' + pplx_api_key,
-    'content-type': 'application/json',
+    "accept": "application/json",
+    "authorization": "Bearer " + pplx_api_key,
+    "content-type": "application/json",
 }

 json_data = {
-    'model': model,
-    'messages': [
+    "model": model,
+    "messages": [
         {
-            'role': 'system',
-            'content': 'Be precise, concise and clear',
+            "role": "system",
+            "content": "Be precise, concise and clear",
         },
         {
-            'role': 'user',
-            'content': 'Search and summarize: ' + article_link,
+            "role": "user",
+            "content": "Search and summarize: " + article_link,
         },
     ],
 }

-response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
+response = requests.post(
+    "https://api.perplexity.ai/chat/completions",
+    headers=headers,
+    json=json_data,
+    timeout=5,
+)

 response = response.json()
-#print(response)
+# print(response)

-#print(response["choices"][0]["message"]["content"])
+# print(response["choices"][0]["message"]["content"])
 with open("response", "w+") as response_file:
-    response_file.write(response["choices"][0]["message"]["content"])
+    response_file.write(response["choices"][0]["message"]["content"])

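Note: the script still indexes response["choices"][0]["message"]["content"] without checking the HTTP status, so a 4xx/5xx reply from the API surfaces as a confusing KeyError. A hedged hardening sketch using only requests' own API (placeholder key, payload elided):

    import requests

    resp = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers={"authorization": "Bearer <key>"},  # placeholder credential
        json={},  # payload elided
        timeout=5,
    )
    resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
    content = resp.json()["choices"][0]["message"]["content"]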