diff --git a/README.md b/README.md
index 6627cfb..3ad9bfc 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,12 @@ I maintain a daily updated (if I use this software) archive section. This is mad
 
 ## Disclaimer
 
-MySides is a personal tool designed to scrape news from APNews. Please note that all material downloaded, used, and reworked by this software is the property of APNews. This tool aims to provide a quick overview of daily news. For more information, please refer to the APNews Terms of Service.
+MySides is a personal tool designed to scrape news from various sources. Please note that all material downloaded, used, and reworked by this software remains the property of its respective sources. This tool aims to provide a quick overview of daily news. For more information, please refer to each source's Terms of Service.
+
+## Built-in sites
+
+- [x] APNews (world news)
+- [x] EuObserver (RSS feed)
 
 ## Work In Progress
 
@@ -17,7 +22,7 @@ Stay tuned.
 
 ## TLDR
 
-MySides scrapes the latest news from APNews and uses Perplexity AI APIs to summarize them into a concise, single-page format.
+MySides scrapes the latest news articles from various sources and uses the Perplexity AI API to summarize them into a concise, single-page format.
 
 ## Perplexity AI?
 
diff --git a/__pycache__/apnews.cpython-311.pyc b/__pycache__/apnews.cpython-311.pyc
index 8c37f1a..83c1c6a 100644
Binary files a/__pycache__/apnews.cpython-311.pyc and b/__pycache__/apnews.cpython-311.pyc differ
diff --git a/apnews.py b/apnews.py
index b79d89c..276bee9 100644
--- a/apnews.py
+++ b/apnews.py
@@ -1,6 +1,13 @@
+from bs4 import BeautifulSoup
+import requests
 
-def fetchAndDigest(soup):
+def getSoup():
+    response = requests.get("https://apnews.com/world-news", timeout=5)
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup
 
+def fetchAndDigest():
+    soup = getSoup()
     news_items = soup.find_all("div", class_="PagePromo")
     print("[+] Filtering out invalid articles...")
     links = []
diff --git a/euobserver.py b/euobserver.py
new file mode 100644
index 0000000..1776476
--- /dev/null
+++ b/euobserver.py
@@ -0,0 +1,13 @@
+import feedparser
+
+def fetchAndDigest():
+    links = []
+    feed = feedparser.parse("https://xml.euobserver.com/rss.xml")
+    for entry in feed.entries:
+
+        article_title = entry.title
+        article_link = entry.link
+        links.append([article_title, article_link])
+
+    print("[+] EuObserver news: " + str(len(links)))
+    return links
\ No newline at end of file
diff --git a/main.py b/main.py
index 9de5d15..ac4c7bb 100644
--- a/main.py
+++ b/main.py
@@ -1,48 +1,17 @@
 import os
-import requests
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 
 # Our modules
 import apnews
+import euobserver
 import summarizer
 
 load_dotenv()
 
 # Loading environment variables
-news_type = os.getenv("NEWS")
 pplx_api_key = os.getenv("PPLX_API_KEY")
 model = os.getenv("MODEL")
 
-# Main menu
-def menu():
-    global news_type
-    available_news = os.getenv("POSSIBLE_NEWS_VALUES")
-    available_news = available_news.split(",")
-    print("[ Welcome to MySides ]")
-    print("[+] Available news: ")
-    counter = 0
-    for avail in available_news:
-        counter += 1
-        print(str(counter) + ") " + avail.strip().replace('"', ""))
-
-    print("[+] Current news: " + news_type)
-    print("[+] Press enter to continue or type a number to change the news type.")
-    news_type_n = input().strip()
-    if news_type_n == "":
-        return
-    try:
-        news_type_n = int(news_type_n)
-    except Exception:
-        menu()
-        print("[!] Invalid news type.")
Invalid news type.") - news_type_n -= 1 - try: - news_type = available_news[news_type_n] - except Exception: - menu() - print("[!] Invalid news type.") - # Fetch and summarize the article def transform_links(links): datas = [] @@ -70,10 +39,14 @@ def transform_links(links): return datas # Downloads the site and extracting the data using the appropriate module -def extract_data(url): - response = requests.get(url, timeout=5) - soup = BeautifulSoup(response.text, "html.parser") - links = apnews.fetchAndDigest(soup) +def extract_data(): + links = [] + + # Plug in your module here (links.extend(your_module.fetchAndDigest()) + links.extend(apnews.fetchAndDigest()) + links.extend(euobserver.fetchAndDigest()) + + print("[+] Total news: " + str(len(links))) datas = transform_links(links) return datas @@ -85,16 +58,10 @@ def handle_pagination(soup): def main(): - global news_type - url = "https://apnews.com/" + news_type all_data = [] - while url: - datas = extract_data(url) - all_data.extend(datas) - url = handle_pagination( - BeautifulSoup(requests.get(url, timeout=5).text, "html.parser") - ) + datas = extract_data() + all_data.extend(datas) # Prepare a nice CSS for the viewing page (nice and clean) css = """ @@ -116,7 +83,7 @@ def main(): """ # Create a nice HTML view of all the articles each one in its own page - html = "