From 4fe0d8e61d692bf6e0c0822436c46b5f9427be74 Mon Sep 17 00:00:00 2001 From: thecookingsenpai Date: Fri, 12 Jan 2024 20:54:35 +0100 Subject: [PATCH] formatted and better coded --- .trunk/.gitignore | 9 ++++ .trunk/configs/.isort.cfg | 2 + .trunk/configs/.markdownlint.yaml | 10 ++++ .trunk/configs/.shellcheckrc | 7 +++ .trunk/configs/ruff.toml | 5 ++ .trunk/trunk.yaml | 31 ++++++++++++ README.md | 4 +- install.sh | 3 +- main.py | 82 ++++++++++++++++++------------- summarizer.py | 35 +++++++------ 10 files changed, 136 insertions(+), 52 deletions(-) create mode 100644 .trunk/.gitignore create mode 100644 .trunk/configs/.isort.cfg create mode 100644 .trunk/configs/.markdownlint.yaml create mode 100644 .trunk/configs/.shellcheckrc create mode 100644 .trunk/configs/ruff.toml create mode 100644 .trunk/trunk.yaml diff --git a/.trunk/.gitignore b/.trunk/.gitignore new file mode 100644 index 0000000..15966d0 --- /dev/null +++ b/.trunk/.gitignore @@ -0,0 +1,9 @@ +*out +*logs +*actions +*notifications +*tools +plugins +user_trunk.yaml +user.yaml +tmp diff --git a/.trunk/configs/.isort.cfg b/.trunk/configs/.isort.cfg new file mode 100644 index 0000000..b9fb3f3 --- /dev/null +++ b/.trunk/configs/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile=black diff --git a/.trunk/configs/.markdownlint.yaml b/.trunk/configs/.markdownlint.yaml new file mode 100644 index 0000000..fb94039 --- /dev/null +++ b/.trunk/configs/.markdownlint.yaml @@ -0,0 +1,10 @@ +# Autoformatter friendly markdownlint config (all formatting rules disabled) +default: true +blank_lines: false +bullet: false +html: false +indentation: false +line_length: false +spaces: false +url: false +whitespace: false diff --git a/.trunk/configs/.shellcheckrc b/.trunk/configs/.shellcheckrc new file mode 100644 index 0000000..8c7b1ad --- /dev/null +++ b/.trunk/configs/.shellcheckrc @@ -0,0 +1,7 @@ +enable=all +source-path=SCRIPTDIR +disable=SC2154 + +# If you're having issues with shellcheck following source, disable the errors via: +# disable=SC1090 +# disable=SC1091 diff --git a/.trunk/configs/ruff.toml b/.trunk/configs/ruff.toml new file mode 100644 index 0000000..f5a235c --- /dev/null +++ b/.trunk/configs/ruff.toml @@ -0,0 +1,5 @@ +# Generic, formatter-friendly config. +select = ["B", "D3", "E", "F"] + +# Never enforce `E501` (line length violations). This should be handled by formatters. +ignore = ["E501"] diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml new file mode 100644 index 0000000..5f2415c --- /dev/null +++ b/.trunk/trunk.yaml @@ -0,0 +1,31 @@ +# This file controls the behavior of Trunk: https://docs.trunk.io/cli +# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml +version: 0.1 +cli: + version: 1.19.0 +# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins) +plugins: + sources: + - id: trunk + ref: v1.4.2 + uri: https://github.com/trunk-io/plugins +# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes) +runtimes: + enabled: + - go@1.21.0 + - node@18.12.1 + - python@3.10.8 +# This is the section where you manage your linters. 
+# (https://docs.trunk.io/check/configuration)
+lint:
+  enabled:
+    - bandit@1.7.6
+    - black@23.12.1
+    - git-diff-check
+    - isort@5.13.2
+    - markdownlint@0.38.0
+    - osv-scanner@1.5.0
+    - prettier@3.1.1
+    - ruff@0.1.12
+    - shellcheck@0.9.0
+    - shfmt@3.6.0
+    - trufflehog@3.63.8
diff --git a/README.md b/README.md
index cd57fce..d007dd2 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Your trustworthy unbiased news scraper.
 
 ## Disclaimer
 
-This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
+This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.
 
 This tool is intended to be used to quickly grasp an overview of the daily news. Please check AllSides ToS for more information.
 
@@ -25,4 +25,4 @@ Personally, I find their API pricing way better than OpenAI ones. If you are a p
 
 ## Run
 
-    python main.py
\ No newline at end of file
+    python main.py
diff --git a/install.sh b/install.sh
index e180c9f..bfa0473 100644
--- a/install.sh
+++ b/install.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
+
 pip install -r requirements.txt
 mkdir news
 cp .env.example .env
 echo "You should now open your .env file and insert your Perplexity API Key."
 echo "You can get one at: https://www.perplexity.ai/settings/api"
 echo "Then, launch main.py and wait for it to finish."
-echo "allsides.html contains an overview of all the news."
\ No newline at end of file
+echo "allsides.html contains an overview of all the news."
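
Note: install.sh runs "pip install -r requirements.txt", but requirements.txt is not part of this patch. Judging from the imports in main.py and summarizer.py, a minimal version would need at least:

    requests
    beautifulsoup4
    python-dotenv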
diff --git a/main.py b/main.py
index 3afb53d..fe8268d 100644
--- a/main.py
+++ b/main.py
@@ -1,11 +1,13 @@
-import requests
-from bs4 import BeautifulSoup
 import os
 
+import requests
+from bs4 import BeautifulSoup
+
+
 def extract_data(url):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    news_items = soup.find_all('div', class_='news-item')
+    response = requests.get(url, timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    news_items = soup.find_all("div", class_="news-item")
     datas = []
     tot_articles = len(news_items)
     print("[+] Total news: " + str(tot_articles))
@@ -13,53 +15,62 @@ def extract_data(url):
     counter = 0
     for news_item in news_items:
         # Extract the article link and title
-        article_link = news_item.find_all('a')[0].get('href')
-        if not "allsides.com" in article_link:
+        article_link = news_item.find_all("a")[0].get("href")
+        if "allsides.com" not in article_link:
             tot_articles -= 1
             continue
-        counter += 1
-        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles) )
-        article_title = news_item.find('div', class_="news-title").text.strip()
+        counter += 1
+        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
+        article_title = news_item.find("div", class_="news-title").text.strip()
         print("[*] Summarizing: " + article_link)
         # Summarize the article
         with open("link", "w+") as f:
             f.write(article_link)
+        # trunk-ignore(bandit/B605)
+        # trunk-ignore(bandit/B607)
         os.system("python summarizer.py")
         print("[OK] Done. Proceeding...")
         with open("response", "r") as f:
             article_summary = f.read().strip()
-        #with open(article_title, "w+") as f:
-            #f.write(article_summary)
+        # with open(article_title, "w+") as f:
+        # f.write(article_summary)
         # Extract the source and media bias rating
         try:
-            source_name = news_item.find('span').text
-        except:
+            source_name = news_item.find("span").text
+        except Exception:
             source_name = "Unknown"
-        try:
-            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
-        except:
+        try:
+            media_bias_rating = (
+                news_item.find("img")
+                .get("alt")
+                .replace("AllSides Media Bias Rating: ", "")
+                .lower()
+            )
+        except Exception:
             media_bias_rating = "Unknown"
         # Build the JSON
         data = {
-            'article_link': article_link,
-            'article_title': article_title,
-            'article_summary': article_summary,
-            'source_name': source_name,
-            'media_bias_rating': media_bias_rating
+            "article_link": article_link,
+            "article_title": article_title,
+            "article_summary": article_summary,
+            "source_name": source_name,
+            "media_bias_rating": media_bias_rating,
         }
         datas.append(data)
     return datas
 
+
 def handle_pagination(soup):
-    next_page = soup.find('a', {'rel': 'next'})
+    next_page = soup.find("a", {"rel": "next"})
     if next_page:
-        return next_page['href']
+        return next_page["href"]
     return None
 
+
 def main():
     url = "https://www.allsides.com/unbiased-balanced-news"
     all_data = []
@@ -67,7 +78,9 @@ def main():
     while url:
         data = extract_data(url)
         all_data.extend(data)
-        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+        url = handle_pagination(
+            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
+        )
 
     # Prepare a nice CSS for the viewing page (nice and clean)
     css = """
@@ -93,11 +106,11 @@ def main():
     html += "<html>"
     html += "<body>"
     for item in all_data:
-        html += "<h2>" + item['article_title'] + "</h2>"
-        html += "<h3>" + item['source_name'] + "</h3>"
-        html += "<h4>" + item['media_bias_rating'] + "</h4>"
-        html += "<p>" + item['article_summary'] + "</p>"
-        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "<h2>" + item["article_title"] + "</h2>"
+        html += "<h3>" + item["source_name"] + "</h3>"
+        html += "<h4>" + item["media_bias_rating"] + "</h4>"
+        html += "<p>" + item["article_summary"] + "</p>"
+        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
     html += "</body>"
     html += "</html>"
     with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
     # Do some math to find the number of articles per bias rating
     bias_ratings = {}
    for item in all_data:
-        if item['media_bias_rating'] in bias_ratings:
-            bias_ratings[item['media_bias_rating']] += 1
+        if item["media_bias_rating"] in bias_ratings:
+            bias_ratings[item["media_bias_rating"]] += 1
         else:
-            bias_ratings[item['media_bias_rating']] = 1
+            bias_ratings[item["media_bias_rating"]] = 1
     # Assign percentages
     for key in bias_ratings:
         bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
-    
+
     print(bias_ratings)
+
 if __name__ == "__main__":
     main()
diff --git a/summarizer.py b/summarizer.py
index d6c32f0..1b3c549 100644
--- a/summarizer.py
+++ b/summarizer.py
@@ -1,6 +1,7 @@
+import os
+
 import requests
 from dotenv import load_dotenv
-import os
 
 load_dotenv()
 
@@ -10,32 +11,36 @@ model = os.getenv("MODEL")
 with open("link", "r") as f:
     article_link = f.read().strip()
-
 headers = {
-    'accept': 'application/json',
-    'authorization': 'Bearer ' + pplx_api_key,
-    'content-type': 'application/json',
+    "accept": "application/json",
+    "authorization": "Bearer " + pplx_api_key,
+    "content-type": "application/json",
 }
 
 json_data = {
-    'model': model,
-    'messages': [
+    "model": model,
+    "messages": [
         {
-            'role': 'system',
-            'content': 'Be precise, concise and clear',
+            "role": "system",
+            "content": "Be precise, concise and clear",
         },
         {
-            'role': 'user',
-            'content': 'Search and summarize: ' + article_link,
+            "role": "user",
+            "content": "Search and summarize: " + article_link,
         },
     ],
 }
 
-response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
+response = requests.post(
+    "https://api.perplexity.ai/chat/completions",
+    headers=headers,
+    json=json_data,
+    timeout=5,
+)
 
 response = response.json()
-#print(response)
+# print(response)
 
-#print(response["choices"][0]["message"]["content"])
+# print(response["choices"][0]["message"]["content"])
 
 with open("response", "w+") as response_file:
-    response_file.write(response["choices"][0]["message"]["content"])
\ No newline at end of file
+    response_file.write(response["choices"][0]["message"]["content"])