formatted and better coded

thecookingsenpai 2024-01-12 20:54:35 +01:00
parent b6abb2a75e
commit 4fe0d8e61d
10 changed files with 136 additions and 52 deletions

.trunk/.gitignore (vendored, new file)

@@ -0,0 +1,9 @@
*out
*logs
*actions
*notifications
*tools
plugins
user_trunk.yaml
user.yaml
tmp

.trunk/configs/.isort.cfg (new file)

@@ -0,0 +1,2 @@
[settings]
profile=black

.trunk/configs/.markdownlint.yaml (new file)

@@ -0,0 +1,10 @@
# Autoformatter friendly markdownlint config (all formatting rules disabled)
default: true
blank_lines: false
bullet: false
html: false
indentation: false
line_length: false
spaces: false
url: false
whitespace: false

.trunk/configs/.shellcheckrc (new file)

@@ -0,0 +1,7 @@
enable=all
source-path=SCRIPTDIR
disable=SC2154
# If you're having issues with shellcheck following source, disable the errors via:
# disable=SC1090
# disable=SC1091

.trunk/configs/ruff.toml (new file)

@@ -0,0 +1,5 @@
# Generic, formatter-friendly config.
select = ["B", "D3", "E", "F"]
# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]

.trunk/trunk.yaml (new file)

@@ -0,0 +1,31 @@
# This file controls the behavior of Trunk: https://docs.trunk.io/cli
# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml
version: 0.1
cli:
  version: 1.19.0
# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins)
plugins:
  sources:
    - id: trunk
      ref: v1.4.2
      uri: https://github.com/trunk-io/plugins
# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes)
runtimes:
  enabled:
    - go@1.21.0
    - node@18.12.1
    - python@3.10.8
# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
lint:
  enabled:
    - bandit@1.7.6
    - black@23.12.1
    - git-diff-check
    - isort@5.13.2
    - markdownlint@0.38.0
    - osv-scanner@1.5.0
    - prettier@3.1.1
    - ruff@0.1.12
    - shellcheck@0.9.0
    - shfmt@3.6.0
    - trufflehog@3.63.8

README.md

@@ -4,7 +4,7 @@ Your trustworthy unbiased news scraper.
## Disclaimer
This tool is made for personal use, and should be used carefully. Being a scraper for (AllSides)[https://allsides.com], all the material downloaded, used and reworked by this software is property of AllSides.
This tool is made for personal use, and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is property of AllSides.
This tool is intended to be used to quickly grasp an overview of the daily news.
Please check AllSides ToS for more information.
@@ -25,4 +25,4 @@ Personally, I find their API pricing way better than OpenAI ones. If you are a p
## Run
python main.py
python main.py

install.sh

@@ -1,8 +1,9 @@
#!/bin/bash
pip install -r requirements.txt
mkdir news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API Key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then, launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."
echo "allsides.html contains an overview of all the news."

main.py

@@ -1,11 +1,13 @@
import requests
from bs4 import BeautifulSoup
import os
import requests
from bs4 import BeautifulSoup

def extract_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")
    news_items = soup.find_all("div", class_="news-item")
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
@@ -13,53 +15,62 @@ def extract_data(url):
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if not "allsides.com" in article_link:
        article_link = news_item.find_all("a")[0].get("href")
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles) )
        article_title = news_item.find('div', class_="news-title").text.strip()
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find("div", class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article
        with open("link", "w+") as f:
            f.write(article_link)
        # trunk-ignore(bandit/B605)
        # trunk-ignore(bandit/B607)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        #with open(article_title, "w+") as f:
        #f.write(article_summary)
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except:
            source_name = news_item.find("span").text
        except Exception:
            source_name = "Unknown"
        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except:
        try:
            media_bias_rating = (
                news_item.find("img")
                .get("alt")
                .replace("AllSides Media Bias Rating: ", "")
                .lower()
            )
        except Exception:
            media_bias_rating = "Unknown"
        # Build the JSON
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
            "article_link": article_link,
            "article_title": article_title,
            "article_summary": article_summary,
            "source_name": source_name,
            "media_bias_rating": media_bias_rating,
        }
        datas.append(data)
    return datas

def handle_pagination(soup):
    next_page = soup.find('a', {'rel': 'next'})
    next_page = soup.find("a", {"rel": "next"})
    if next_page:
        return next_page['href']
        return next_page["href"]
    return None

def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []
@@ -67,7 +78,9 @@ def main():
    while url:
        data = extract_data(url)
        all_data.extend(data)
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
        url = handle_pagination(
            BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")
        )

    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
@@ -93,11 +106,11 @@ def main():
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<h1>" + item["article_title"] + "</h1>"
        html += "<h2>" + item["source_name"] + "</h2>"
        html += "<h3>" + item["media_bias_rating"] + "</h3>"
        html += "<p>" + item["article_summary"] + "</p>"
        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
@@ -107,15 +120,16 @@ def main():
    # Do some math to find the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        if item["media_bias_rating"] in bias_ratings:
            bias_ratings[item["media_bias_rating"]] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
            bias_ratings[item["media_bias_rating"]] = 1
    # Assign percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
    print(bias_ratings)

if __name__ == "__main__":
    main()
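
The two trunk-ignore comments in extract_data() silence bandit's B605/B607 findings about launching summarizer.py through os.system. As an illustration of what those warnings point at — not something this commit does — the shell could be avoided with subprocess:

```python
import subprocess
import sys

# Hypothetical alternative to os.system("python summarizer.py"):
# passing a list skips the shell (the point of bandit B605/B607), and
# check=True raises CalledProcessError if summarizer.py exits non-zero.
subprocess.run([sys.executable, "summarizer.py"], check=True)
```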

summarizer.py

@@ -1,6 +1,7 @@
import os

import requests
from dotenv import load_dotenv
import os

load_dotenv()
@@ -10,32 +11,36 @@ model = os.getenv("MODEL")
with open("link", "r") as f:
    article_link = f.read().strip()

headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
    "accept": "application/json",
    "authorization": "Bearer " + pplx_api_key,
    "content-type": "application/json",
}

json_data = {
    'model': model,
    'messages': [
    "model": model,
    "messages": [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
            "role": "system",
            "content": "Be precise, concise and clear",
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
            "role": "user",
            "content": "Search and summarize: " + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
response = requests.post(
    "https://api.perplexity.ai/chat/completions",
    headers=headers,
    json=json_data,
    timeout=5,
)
response = response.json()

#print(response)
# print(response)
#print(response["choices"][0]["message"]["content"])
# print(response["choices"][0]["message"]["content"])

with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])
    response_file.write(response["choices"][0]["message"]["content"])
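
The final write assumes a well-formed completion: if the API returns an error payload there is no "choices" key and the script raises KeyError. A defensive variant — a sketch under that assumption, not part of this commit — might look like:

```python
# Illustrative only: guard against an error payload before indexing.
try:
    content = response["choices"][0]["message"]["content"]
except (KeyError, IndexError, TypeError):
    raise SystemExit("[!] Unexpected API response: " + str(response))

with open("response", "w+") as response_file:
    response_file.write(content)
```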