mirror of https://github.com/tcsenpai/mysides.git

Commit 4fe0d8e61d (parent b6abb2a75e): formatted and better coded
.trunk/.gitignore (vendored, new file, 9 lines)
@@ -0,0 +1,9 @@
+*out
+*logs
+*actions
+*notifications
+*tools
+plugins
+user_trunk.yaml
+user.yaml
+tmp
.trunk/configs/.isort.cfg (new file, 2 lines)
@@ -0,0 +1,2 @@
+[settings]
+profile=black
.trunk/configs/.markdownlint.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
+# Autoformatter friendly markdownlint config (all formatting rules disabled)
+default: true
+blank_lines: false
+bullet: false
+html: false
+indentation: false
+line_length: false
+spaces: false
+url: false
+whitespace: false
.trunk/configs/.shellcheckrc (new file, 7 lines)
@@ -0,0 +1,7 @@
+enable=all
+source-path=SCRIPTDIR
+disable=SC2154
+
+# If you're having issues with shellcheck following source, disable the errors via:
+# disable=SC1090
+# disable=SC1091
.trunk/configs/ruff.toml (new file, 5 lines)
@@ -0,0 +1,5 @@
+# Generic, formatter-friendly config.
+select = ["B", "D3", "E", "F"]
+
+# Never enforce `E501` (line length violations). This should be handled by formatters.
+ignore = ["E501"]
.trunk/trunk.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
+# This file controls the behavior of Trunk: https://docs.trunk.io/cli
+# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
+version: 0.1
+cli:
+  version: 1.19.0
+# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
+plugins:
+  sources:
+    - id: trunk
+      ref: v1.4.2
+      uri: https://github.com/trunk-io/plugins
+# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
+runtimes:
+  enabled:
+    - go@1.21.0
+    - node@18.12.1
+    - python@3.10.8
+# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
+lint:
+  enabled:
+    - bandit@1.7.6
+    - black@23.12.1
+    - git-diff-check
+    - isort@5.13.2
+    - markdownlint@0.38.0
+    - osv-scanner@1.5.0
+    - prettier@3.1.1
+    - ruff@0.1.12
+    - shellcheck@0.9.0
+    - shfmt@3.6.0
+    - trufflehog@3.63.8
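The trunk.yaml above wires up the formatters (black, isort, prettier, shfmt) and linters (bandit, ruff, shellcheck, markdownlint, ...) that the rest of this commit's reformatting reflects. As a hedged sketch only, not part of the repository, the configured checks would typically be driven through the Trunk CLI; the wrapper below is hypothetical and assumes the standard `trunk fmt` and `trunk check` commands are available.

```python
# Hypothetical helper, not in the repo: runs the Trunk-managed formatters and linters.
# Assumes the trunk CLI is installed and this trunk.yaml is in place.
import subprocess

subprocess.run(["trunk", "fmt"], check=True)    # apply black, isort, prettier, shfmt, ...
subprocess.run(["trunk", "check"], check=True)  # run bandit, ruff, shellcheck, markdownlint, ...
```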
@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.

 ## Disclaimer

-This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
+This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.

 This tool is intended to be used to quickly grasp an overview of the daily news.
 Please check AllSides ToS for more information.
@@ -1,4 +1,5 @@
 #!/bin/bash
+
 pip install -r requirements.txt
 mkdir news
 cp .env.example .env
main.py (70 lines changed)
@@ -1,11 +1,13 @@
-import requests
-from bs4 import BeautifulSoup
 import os
+
+import requests
+from bs4 import BeautifulSoup


 def extract_data(url):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    news_items = soup.find_all('div', class_='news-item')
+    response = requests.get(url, timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    news_items = soup.find_all("div", class_="news-item")
     datas = []
     tot_articles = len(news_items)
     print("[+] Total news: " + str(tot_articles))
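Besides regrouping the imports and switching to double quotes, the hunk above adds `timeout=5` to `requests.get`, so a stalled AllSides response raises a `requests` exception instead of hanging the scraper. A minimal sketch of how that failure could be caught is below; `fetch_soup` is a hypothetical helper, not part of main.py, which simply lets the exception propagate.

```python
# Hedged sketch, not part of the commit: catching the timeout that main.py now sets.
import requests
from bs4 import BeautifulSoup


def fetch_soup(url, timeout=5):
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:  # Timeout, ConnectionError, HTTP errors
        print("[!] Request failed: " + str(exc))
        return None
    return BeautifulSoup(response.text, "html.parser")
```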
@@ -13,17 +15,19 @@ def extract_data(url):
     counter = 0
     for news_item in news_items:
         # Extract the article link and title
-        article_link = news_item.find_all('a')[0].get('href')
-        if not "allsides.com" in article_link:
+        article_link = news_item.find_all("a")[0].get("href")
+        if "allsides.com" not in article_link:
             tot_articles -= 1
             continue
         counter += 1
         print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
-        article_title = news_item.find('div', class_="news-title").text.strip()
+        article_title = news_item.find("div", class_="news-title").text.strip()
         print("[*] Summarizing: " + article_link)
         # Summarize the article
         with open("link", "w+") as f:
             f.write(article_link)
+        # trunk-ignore(bandit/B605)
+        # trunk-ignore(bandit/B607)
         os.system("python summarizer.py")
         print("[OK] Done. Proceeding...")
         with open("response", "r") as f:
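The two new `trunk-ignore` comments silence bandit B605 (starting a process with a shell) and B607 (starting a process with a partial executable path) for the `os.system` call, rather than changing the call itself. For comparison only, a hedged sketch of the `subprocess` form that bandit would not flag; this is not what the commit does.

```python
# Hedged alternative, not in the commit: a bandit-friendly replacement for os.system.
import subprocess
import sys

# A list argument avoids shell interpretation (B605), and sys.executable avoids
# relying on a bare "python" looked up on PATH (B607).
subprocess.run([sys.executable, "summarizer.py"], check=True)
```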
@@ -32,34 +36,41 @@ def extract_data(url):
         # f.write(article_summary)
         # Extract the source and media bias rating
         try:
-            source_name = news_item.find('span').text
-        except:
+            source_name = news_item.find("span").text
+        except Exception:
             source_name = "Unknown"

         try:
-            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
-        except:
+            media_bias_rating = (
+                news_item.find("img")
+                .get("alt")
+                .replace("AllSides Media Bias Rating: ", "")
+                .lower()
+            )
+        except Exception:
             media_bias_rating = "Unknown"

         # Build the JSON
         data = {
-            'article_link': article_link,
-            'article_title': article_title,
-            'article_summary': article_summary,
-            'source_name': source_name,
-            'media_bias_rating': media_bias_rating
+            "article_link": article_link,
+            "article_title": article_title,
+            "article_summary": article_summary,
+            "source_name": source_name,
+            "media_bias_rating": media_bias_rating,
         }

         datas.append(data)

     return datas


 def handle_pagination(soup):
-    next_page = soup.find('a', {'rel': 'next'})
+    next_page = soup.find("a", {"rel": "next"})
     if next_page:
-        return next_page['href']
+        return next_page["href"]
     return None


 def main():
     url = "https://www.allsides.com/unbiased-balanced-news"
     all_data = []
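Beyond the quote-style changes, the bare `except:` clauses above become `except Exception:`, which still catches the lookup failures (a missing `<span>` or `<img>` makes `find()` return `None`, so the following attribute access raises `AttributeError`) without also swallowing `KeyboardInterrupt` or `SystemExit`. A small illustrative snippet, not from the repository:

```python
# Illustration only: the failure mode the except blocks guard against.
from bs4 import BeautifulSoup

news_item = BeautifulSoup("<div class='news-item'></div>", "html.parser").find(
    "div", class_="news-item"
)
try:
    source_name = news_item.find("span").text  # find() returns None -> AttributeError
except Exception:
    source_name = "Unknown"
print(source_name)  # Unknown
```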
@@ -67,7 +78,9 @@ def main():
     while url:
         data = extract_data(url)
         all_data.extend(data)
-        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+        url = handle_pagination(
+            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
+        )

     # Prepare a nice CSS for the viewing page (nice and clean)
     css = """
@@ -93,11 +106,11 @@ def main():
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
-        html += "<h1>" + item['article_title'] + "</h1>"
-        html += "<h2>" + item['source_name'] + "</h2>"
-        html += "<h3>" + item['media_bias_rating'] + "</h3>"
-        html += "<p>" + item['article_summary'] + "</p>"
-        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "<h1>" + item["article_title"] + "</h1>"
+        html += "<h2>" + item["source_name"] + "</h2>"
+        html += "<h3>" + item["media_bias_rating"] + "</h3>"
+        html += "<p>" + item["article_summary"] + "</p>"
+        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
     with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
     # Do some math to find the number of articles per bias rating
     bias_ratings = {}
     for item in all_data:
-        if item['media_bias_rating'] in bias_ratings:
-            bias_ratings[item['media_bias_rating']] += 1
+        if item["media_bias_rating"] in bias_ratings:
+            bias_ratings[item["media_bias_rating"]] += 1
         else:
-            bias_ratings[item['media_bias_rating']] = 1
+            bias_ratings[item["media_bias_rating"]] = 1
     # Assign percentages
     for key in bias_ratings:
         bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

     print(bias_ratings)


 if __name__ == "__main__":
     main()
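This last main.py hunk keeps the per-rating tally and converts the counts to percentages with `round(count / len(all_data) * 100, 2)`. A hedged worked example with made-up data, using `dict.get` as a compact equivalent of the if/else above:

```python
# Worked example with invented data; mirrors the counting and percentage steps.
all_data = [
    {"media_bias_rating": "left"},
    {"media_bias_rating": "center"},
    {"media_bias_rating": "center"},
    {"media_bias_rating": "right"},
]
bias_ratings = {}
for item in all_data:
    rating = item["media_bias_rating"]
    bias_ratings[rating] = bias_ratings.get(rating, 0) + 1
for key in bias_ratings:
    bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
print(bias_ratings)  # {'left': 25.0, 'center': 50.0, 'right': 25.0}
```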
@@ -1,6 +1,7 @@
+import os
+
 import requests
 from dotenv import load_dotenv
-import os

 load_dotenv()
@@ -10,28 +11,32 @@ model = os.getenv("MODEL")
 with open("link", "r") as f:
     article_link = f.read().strip()


 headers = {
-    'accept': 'application/json',
-    'authorization': 'Bearer ' + pplx_api_key,
-    'content-type': 'application/json',
+    "accept": "application/json",
+    "authorization": "Bearer " + pplx_api_key,
+    "content-type": "application/json",
 }

 json_data = {
-    'model': model,
-    'messages': [
+    "model": model,
+    "messages": [
         {
-            'role': 'system',
-            'content': 'Be precise, concise and clear',
+            "role": "system",
+            "content": "Be precise, concise and clear",
         },
         {
-            'role': 'user',
-            'content': 'Search and summarize: ' + article_link,
+            "role": "user",
+            "content": "Search and summarize: " + article_link,
         },
     ],
 }

-response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
+response = requests.post(
+    "https://api.perplexity.ai/chat/completions",
+    headers=headers,
+    json=json_data,
+    timeout=5,
+)

 response = response.json()
 # print(response)
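The `requests.post` call in the summarizer script targets Perplexity's chat-completions endpoint and, like the calls in main.py, now passes `timeout=5`. The hunk ends before the summary is extracted; as a hedged sketch only, assuming the response follows the usual OpenAI-style `choices[0].message.content` shape (not shown in this diff), the text written to the `response` file that main.py reads back might be produced like this:

```python
# Hedged sketch, assuming an OpenAI-style response body; the real extraction
# code lies outside the lines shown in this hunk.
summary = response["choices"][0]["message"]["content"]
with open("response", "w+") as f:
    f.write(summary)
```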