formatted and better coded

thecookingsenpai 2024-01-12 20:54:35 +01:00
parent b6abb2a75e
commit 4fe0d8e61d
10 changed files with 136 additions and 52 deletions

.trunk/.gitignore (new vendored file)

@@ -0,0 +1,9 @@
*out
*logs
*actions
*notifications
*tools
plugins
user_trunk.yaml
user.yaml
tmp

.trunk/configs/.isort.cfg (new file)

@@ -0,0 +1,2 @@
[settings]
profile=black

.trunk/configs/.markdownlint.yaml (new file)

@@ -0,0 +1,10 @@
# Autoformatter friendly markdownlint config (all formatting rules disabled)
default: true
blank_lines: false
bullet: false
html: false
indentation: false
line_length: false
spaces: false
url: false
whitespace: false

.trunk/configs/.shellcheckrc (new file)

@@ -0,0 +1,7 @@
enable=all
source-path=SCRIPTDIR
disable=SC2154
# If you're having issues with shellcheck following source, disable the errors via:
# disable=SC1090
# disable=SC1091

.trunk/configs/ruff.toml (new file)

@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]
# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]

.trunk/trunk.yaml (new file)

@@ -0,0 +1,31 @@
# This file controls the behavior of Trunk: https://docs.trunk.io/cli
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
  version: 1.19.0
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
  sources:
    - id: trunk
      ref: v1.4.2
      uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
  enabled:
    - go@1.21.0
    - node@18.12.1
    - python@3.10.8
# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
lint:
  enabled:
    - bandit@1.7.6
    - black@23.12.1
    - git-diff-check
    - isort@5.13.2
    - markdownlint@0.38.0
    - osv-scanner@1.5.0
    - prettier@3.1.1
    - ruff@0.1.12
    - shellcheck@0.9.0
    - shfmt@3.6.0
    - trufflehog@3.63.8
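
With this file in place, the enabled linters are normally invoked through the Trunk CLI (trunk check). A minimal sketch of driving that from Python, assuming the trunk launcher is installed and on PATH:

import subprocess

# Run the linters enabled in .trunk/trunk.yaml over the working tree.
# `trunk check` is the Trunk CLI's standard entry point (https://docs.trunk.io/cli).
result = subprocess.run(["trunk", "check"], check=False)
print("trunk check exited with code", result.returncode)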

README.md

@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.
 ## Disclaimer

-This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
+This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.
 This tool is intended to be used to quickly grasp an overview of the daily news.
 Please check AllSides ToS for more information.

install.sh

@@ -1,4 +1,5 @@
 #!/bin/bash
 pip install -r requirements.txt
 mkdir news
 cp .env.example .env

main.py

@@ -1,11 +1,13 @@
-import requests
-from bs4 import BeautifulSoup
 import os
+
+import requests
+from bs4 import BeautifulSoup
+

 def extract_data(url):
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    news_items = soup.find_all('div', class_='news-item')
+    response = requests.get(url, timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    news_items = soup.find_all("div", class_="news-item")
     datas = []
     tot_articles = len(news_items)
     print("[+] Total news: " + str(tot_articles))
@@ -13,17 +15,19 @@ def extract_data(url):
     counter = 0
     for news_item in news_items:
         # Extract the article link and title
-        article_link = news_item.find_all('a')[0].get('href')
-        if not "allsides.com" in article_link:
+        article_link = news_item.find_all("a")[0].get("href")
+        if "allsides.com" not in article_link:
             tot_articles -= 1
             continue
         counter += 1
         print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
-        article_title = news_item.find('div', class_="news-title").text.strip()
+        article_title = news_item.find("div", class_="news-title").text.strip()
         print("[*] Summarizing: " + article_link)
         # Summarize the article
         with open("link", "w+") as f:
             f.write(article_link)
+        # trunk-ignore(bandit/B605)
+        # trunk-ignore(bandit/B607)
         os.system("python summarizer.py")
         print("[OK] Done. Proceeding...")
         with open("response", "r") as f:
@@ -32,34 +36,41 @@ def extract_data(url):
         # f.write(article_summary)
         # Extract the source and media bias rating
         try:
-            source_name = news_item.find('span').text
-        except:
+            source_name = news_item.find("span").text
+        except Exception:
             source_name = "Unknown"
         try:
-            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
-        except:
+            media_bias_rating = (
+                news_item.find("img")
+                .get("alt")
+                .replace("AllSides Media Bias Rating: ", "")
+                .lower()
+            )
+        except Exception:
             media_bias_rating = "Unknown"
         # Build the JSON
         data = {
-            'article_link': article_link,
-            'article_title': article_title,
-            'article_summary': article_summary,
-            'source_name': source_name,
-            'media_bias_rating': media_bias_rating
+            "article_link": article_link,
+            "article_title": article_title,
+            "article_summary": article_summary,
+            "source_name": source_name,
+            "media_bias_rating": media_bias_rating,
         }
         datas.append(data)
     return datas

+
 def handle_pagination(soup):
-    next_page = soup.find('a', {'rel': 'next'})
+    next_page = soup.find("a", {"rel": "next"})
     if next_page:
-        return next_page['href']
+        return next_page["href"]
     return None

+
 def main():
     url = "https://www.allsides.com/unbiased-balanced-news"
     all_data = []
@@ -67,7 +78,9 @@ def main():
     while url:
         data = extract_data(url)
         all_data.extend(data)
-        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+        url = handle_pagination(
+            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
+        )

     # Prepare a nice CSS for the viewing page (nice and clean)
     css = """
@@ -93,11 +106,11 @@ def main():
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
-        html += "<h1>" + item['article_title'] + "</h1>"
-        html += "<h2>" + item['source_name'] + "</h2>"
-        html += "<h3>" + item['media_bias_rating'] + "</h3>"
-        html += "<p>" + item['article_summary'] + "</p>"
-        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "<h1>" + item["article_title"] + "</h1>"
+        html += "<h2>" + item["source_name"] + "</h2>"
+        html += "<h3>" + item["media_bias_rating"] + "</h3>"
+        html += "<p>" + item["article_summary"] + "</p>"
+        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
     with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
     # Do some math to find the number of articles per bias rating
     bias_ratings = {}
     for item in all_data:
-        if item['media_bias_rating'] in bias_ratings:
-            bias_ratings[item['media_bias_rating']] += 1
+        if item["media_bias_rating"] in bias_ratings:
+            bias_ratings[item["media_bias_rating"]] += 1
         else:
-            bias_ratings[item['media_bias_rating']] = 1
+            bias_ratings[item["media_bias_rating"]] = 1
     # Assign percentages
     for key in bias_ratings:
         bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
     print(bias_ratings)

+
 if __name__ == "__main__":
     main()
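
The two trunk-ignore comments added above suppress Bandit's B605 (starting a process with a shell) and B607 (starting a process with a partial executable path) findings on the os.system call. A shell-free alternative, sketched here rather than taken from the commit, would invoke the interpreter through subprocess with an explicit argument list:

import subprocess
import sys

# sys.executable is an absolute path and the argument list bypasses the shell,
# so neither B605 nor B607 applies; check=True surfaces a non-zero exit status.
subprocess.run([sys.executable, "summarizer.py"], check=True)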

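The bias-rating arithmetic at the end of main() counts articles per rating and turns the counts into percentages of the total. An equivalent sketch with collections.Counter, using made-up sample data:

from collections import Counter

# Hypothetical sample: 4 left, 3 center, 3 right out of 10 articles.
ratings = ["left"] * 4 + ["center"] * 3 + ["right"] * 3
percentages = {
    rating: round(count / len(ratings) * 100, 2)
    for rating, count in Counter(ratings).items()
}
print(percentages)  # {'left': 40.0, 'center': 30.0, 'right': 30.0}
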
summarizer.py

@@ -1,6 +1,7 @@
+import os
+
 import requests
 from dotenv import load_dotenv
-import os

 load_dotenv()
@@ -10,28 +11,32 @@ model = os.getenv("MODEL")
 with open("link", "r") as f:
     article_link = f.read().strip()

 headers = {
-    'accept': 'application/json',
-    'authorization': 'Bearer ' + pplx_api_key,
-    'content-type': 'application/json',
+    "accept": "application/json",
+    "authorization": "Bearer " + pplx_api_key,
+    "content-type": "application/json",
 }

 json_data = {
-    'model': model,
-    'messages': [
+    "model": model,
+    "messages": [
         {
-            'role': 'system',
-            'content': 'Be precise, concise and clear',
+            "role": "system",
+            "content": "Be precise, concise and clear",
         },
         {
-            'role': 'user',
-            'content': 'Search and summarize: ' + article_link,
+            "role": "user",
+            "content": "Search and summarize: " + article_link,
         },
     ],
 }

-response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
+response = requests.post(
+    "https://api.perplexity.ai/chat/completions",
+    headers=headers,
+    json=json_data,
+    timeout=5,
+)
 response = response.json()
 # print(response)
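
main.py reads the finished summary back from a file named response once summarizer.py exits. The tail of summarizer.py is not shown in this diff; assuming Perplexity's OpenAI-style chat-completion payload, the extraction step would look roughly like this sketch (the choices[0]["message"]["content"] path is an assumption, not code from the commit):

# Continuation sketch: `response` is the parsed JSON from the request above.
summary = response["choices"][0]["message"]["content"]

with open("response", "w+") as f:
    f.write(summary)  # main.py reads this file after summarizer.py finishes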