import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Our modules
import apnews
import summarizer

load_dotenv()  # Load environment variables

news_type = os.getenv("NEWS")
pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")


# Main menu
def menu():
    global news_type
    available_news = os.getenv("POSSIBLE_NEWS_VALUES", "").split(",")
    print("[ Welcome to MySides ]")
    print("[+] Available news:")
    for counter, avail in enumerate(available_news, start=1):
        print(str(counter) + ") " + avail.strip().replace('"', ""))
    print("[+] Current news: " + news_type)
    print("[+] Press enter to continue or type a number to change the news type.")
    news_type_n = input().strip()
    if news_type_n == "":
        return
    try:
        news_type_n = int(news_type_n)
    except ValueError:
        print("[!] Invalid news type.")
        menu()
        return
    news_type_n -= 1
    if 0 <= news_type_n < len(available_news):
        news_type = available_news[news_type_n].strip().replace('"', "")
    else:
        print("[!] Invalid news type.")
        menu()


# Fetch and summarize each article
def transform_links(links):
    datas = []
    print("[+] Extracting data from articles...")
    for counter, link in enumerate(links, start=1):
        print("[+] Article " + str(counter) + " of " + str(len(links)))
        article_title = link[0]
        article_link = link[1]
        print("[ " + article_title + " ]")
        print("[+] Extracting data from: " + article_link)
        try:
            article_summary = summarizer.summarize(article_link, pplx_api_key, model)
        except Exception as e:
            print(e)
            print("[!] Invalid article. Skipping...")
            continue
        datas.append(
            {
                "article_title": article_title,
                "article_link": article_link,
                "article_summary": article_summary,
            }
        )
    return datas


# Downloads the site and extracts the data using the appropriate module;
# also returns the parsed soup so pagination can reuse it instead of
# fetching the same page a second time
def extract_data(url):
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")
    links = apnews.fetchAndDigest(soup)
    datas = transform_links(links)
    return datas, soup


def handle_pagination(soup):
    next_page = soup.find("a", {"rel": "next"})
    if next_page:
        return next_page["href"]
    return None


def main():
    global news_type
    url = "https://apnews.com/" + news_type
    all_data = []
    while url:
        datas, soup = extract_data(url)
        all_data.extend(datas)
        next_url = handle_pagination(soup)
        # Resolve relative "next" links against the current page URL
        url = urljoin(url, next_url) if next_url else None

    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
    body { font-family: Helvetica, Arial, sans-serif; }
    h1 { font-size: 2em; }
    h2 { font-size: 1.5em; }
    h3 { font-size: 1.2em; }
    p { font-size: 1em; }
    """

    # Create an HTML view of all the articles, each in its own section
    html = "<html><head><title>APNews Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<div>"
" + item["article_title"] + "

" html += "

" + item["article_summary"] + "

" html += "Read the full article" html += "
" html += "" with open("ap.html", "w+") as f: f.write(html) # Archiving (skip if causes errors) os.system("./archiver.sh") print("Total articles: ", len(all_data)) if __name__ == "__main__": menu() print("[+] News type: " + news_type) main()