From a43ed9c899ae12d1a3fbd67dba10ef15707a57ce Mon Sep 17 00:00:00 2001 From: tcsenpai Date: Wed, 1 Jan 2025 13:33:44 +0100 Subject: [PATCH] first commit --- .gitignore | 5 ++ LICENSE | 21 +++++ README.md | 38 ++++++++ pyproject.toml | 26 ++++++ src/spurelations/__init__.py | 6 ++ src/spurelations/main.py | 162 +++++++++++++++++++++++++++++++++++ 6 files changed, 258 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 src/spurelations/__init__.py create mode 100644 src/spurelations/main.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..795b551 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +tests +images +dist +src/spurelations/__pycache__ +src/spurelations.egg-info \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d85a048 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 tcsenpai + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a910828 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Spurelations + +Download spurious correlations from tylervigen.com. + +## Installation + +```bash +pip install spurelations +``` + +## Quick tips + +- Images are saved in `~/spurelations/images/` + +## Usage + +### Download a single correlation + +```bash +spurelations +``` + +### Download all correlations until stopped + +```bash +spurelations --all +``` + +### Download N correlations + +```bash +spurelations --num 10 +``` + +## Features + +- Automatically extract the correlation data from the website +- Avoid downloading the same correlation multiple times diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1486c91 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[build-system] +requires = ["setuptools>=45", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "spurelations" +version = "0.1.1" +description = "Download spurious correlations from tylervigen.com" +readme = "README.md" +authors = [{ name = "tcsenpai", email = "tcsenpai@discus.sh" }] +license = { file = "LICENSE" } +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +keywords = ["spurious", "correlations", "data", "visualization"] +dependencies = ["beautifulsoup4", "requests", "tqdm", "colorama"] +requires-python = ">=3.7" + +[project.urls] +Homepage = "https://github.com/tcsenpai/spurelations" +Repository = "https://github.com/tcsenpai/spurelations.git" + +[project.scripts] +spurelations = "spurelations.main:main" diff --git a/src/spurelations/__init__.py 
# NOTE: this span came from a whitespace-collapsed `git format-patch` paste.
# The patch metadata and the small `__init__.py` hunk that shared these lines
# are retained verbatim below as comments. Hunk line counts describe the
# original commit, not the re-formatted module that follows.
#
# b/src/spurelations/__init__.py
# new file mode 100644
# index 0000000..5c9f0d5
# --- /dev/null
# +++ b/src/spurelations/__init__.py
# @@ -0,0 +1,6 @@
# +"""Spurelations package."""
# +
# +from .main import main
# +
# +__version__ = "0.1.0"
# +__all__ = ["main"]
# diff --git a/src/spurelations/main.py b/src/spurelations/main.py
# new file mode 100644
# index 0000000..1376ba3
# --- /dev/null
# +++ b/src/spurelations/main.py
# @@ -0,0 +1,162 @@
"""Spurelations - Download spurious correlations from tylervigen.com."""

# NOTE(review): pyproject.toml in the same commit declares version "0.1.1";
# confirm which value is authoritative before the next release.
__version__ = "0.1.0"

import argparse
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from colorama import Fore, Style, init
from tqdm import tqdm

# Initialize colorama for Windows compatibility
init()

# Page that redirects to / serves a random correlation.
RANDOM_URL = "https://tylervigen.com/spurious/random"
# Base against which relative image links found in a page are resolved.
CORRELATION_BASE_URL = "https://tylervigen.com/spurious/correlation/"
# Seconds to wait for the server before giving up; without a timeout
# `requests` can block forever on a stalled connection.
REQUEST_TIMEOUT = 30
# Stop after this many already-downloaded images in a row.
MAX_CONSECUTIVE_EXISTING = 10
# Stop after this many failed attempts in a row (network down, site changed).
MAX_CONSECUTIVE_FAILURES = 10
# Chunk size used when streaming an image to disk.
DOWNLOAD_CHUNK_SIZE = 64 * 1024


def log_info(message):
    """Print *message* with a cyan ``[INFO]`` prefix."""
    print(f"{Fore.CYAN}[INFO]{Style.RESET_ALL} {message}")


def log_success(message):
    """Print *message* with a green ``[SUCCESS]`` prefix."""
    print(f"{Fore.GREEN}[SUCCESS]{Style.RESET_ALL} {message}")


def log_warning(message):
    """Print *message* with a yellow ``[WARNING]`` prefix."""
    print(f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {message}")


def log_error(message):
    """Print *message* with a red ``[ERROR]`` prefix."""
    print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}")


def extract_png_link(html_content):
    """Return the absolute URL of the page's "Download png" link.

    Scans every ``<a>`` tag for an ``href`` ending in ``.png`` whose text
    contains ``"Download png"``.

    Args:
        html_content: HTML source of a correlation page.

    Returns:
        The absolute PNG URL, or ``None`` when no matching link exists.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href", "")
        if href.endswith(".png") and "Download png" in link.get_text():
            # urljoin leaves absolute URLs untouched and resolves every kind
            # of relative href (not only those starting with "image/").
            href = urljoin(CORRELATION_BASE_URL, href)
            log_info(f"Found link: {href}")
            return href
    return None


def _download_png(png_link, filepath):
    """Stream *png_link* to *filepath*; return the HTTP status code.

    The body is written to a temporary ``.part`` file next to *filepath* and
    renamed into place only once complete, so an interrupted download never
    leaves a truncated image that a later run would mistake for a finished
    one. Nothing is written unless the status code is 200.
    """
    with requests.get(png_link, stream=True, timeout=REQUEST_TIMEOUT) as png_response:
        if png_response.status_code != 200:
            return png_response.status_code

        partial = tempfile.NamedTemporaryFile(
            dir=str(filepath.parent), suffix=".part", delete=False
        )
        partial_path = Path(partial.name)
        try:
            with partial:
                # iter_content decodes gzip/deflate transfer encodings,
                # unlike copying from the raw socket stream.
                for chunk in png_response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    partial.write(chunk)
            os.replace(partial_path, filepath)
        except BaseException:
            # Also covers Ctrl+C: remove the partial file, then re-raise.
            if partial_path.exists():
                partial_path.unlink()
            raise
        return png_response.status_code


def get_png_from_page(url):
    """Fetch *url*, locate its PNG download link and save the image.

    Images are stored in ``~/spurelations/images/`` under the file name taken
    from the PNG URL.

    Args:
        url: Address of a correlation page.

    Returns:
        ``"SUCCESS"`` when a new image was saved, ``"EXISTS"`` when the image
        is already on disk, ``"NO_LINK"`` when the page has no download link,
        or a string starting with ``"Error: "`` for any failure. Ordinary
        exceptions are logged and converted to that error string rather than
        raised.
    """
    # Broad catch is deliberate: this is the boundary the CLI loop relies on
    # to keep going after a failed attempt.
    try:
        log_info(f"Fetching page from: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            failure = f"Error: page request returned HTTP {response.status_code}"
            log_error(failure)
            return failure
        log_info("Successfully retrieved page")

        png_link = extract_png_link(response.text)
        if not png_link:
            log_warning("No PNG download link found in the page")
            return "NO_LINK"
        log_info(f"Found PNG link: {png_link}")

        # Create images directory in user's home
        images_dir = Path.home() / "spurelations" / "images"
        images_dir.mkdir(parents=True, exist_ok=True)
        log_info(f"Ensured '{images_dir}' directory exists")

        # Extract filename from the PNG URL
        filepath = images_dir / png_link.rsplit("/", 1)[-1]

        if filepath.exists():
            log_warning(f"File already exists: {filepath}")
            return "EXISTS"

        log_info(f"Downloading PNG from: {png_link}")
        status = _download_png(png_link, filepath)
        if status != 200:
            failure = f"Error: PNG request returned HTTP {status}"
            log_error(failure)
            return failure

        log_success(f"Successfully saved PNG to: {filepath}")
        return "SUCCESS"
    except Exception as e:
        log_error(f"Error: {str(e)}")
        return f"Error: {str(e)}"


def _positive_int(value):
    """argparse ``type=`` callable accepting only integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError("must be a positive integer")
    return number


def main():
    """Command-line entry point.

    With no options a single download is attempted. ``--all`` keeps
    downloading until interrupted; ``--num N`` downloads until N new images
    were saved. Both loops also stop after ``MAX_CONSECUTIVE_EXISTING``
    already-present images in a row or ``MAX_CONSECUTIVE_FAILURES`` failed
    attempts in a row. A summary is printed at the end.
    """
    parser = argparse.ArgumentParser(description="Download random correlation PNGs")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--all", action="store_true", help="Download images until Ctrl+C"
    )
    group.add_argument("--num", type=_positive_int, help="Download N images")
    args = parser.parse_args()

    url = RANDOM_URL
    separator = f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}"
    exists_count = 0
    failure_count = 0
    downloaded = 0
    repeat = args.all or args.num is not None
    pbar = None

    try:
        if args.num is not None:
            # Use tqdm for progress bar when --num is specified
            pbar = tqdm(total=args.num, desc="Downloading images")

        while True:
            print(separator)
            log_info("Starting PNG extraction process...")
            result = get_png_from_page(url)

            if result == "EXISTS":
                failure_count = 0
                exists_count += 1
                if exists_count >= MAX_CONSECUTIVE_EXISTING:
                    log_warning(
                        f"\nReached {MAX_CONSECUTIVE_EXISTING} existing files, stopping..."
                    )
                    break
            elif result == "SUCCESS":
                downloaded += 1
                exists_count = 0  # Reset counter on successful download
                failure_count = 0

                if pbar is not None:
                    pbar.update(1)
                    if downloaded >= args.num:
                        log_success(
                            f"\nReached target of {args.num} downloads, stopping..."
                        )
                        break
            else:
                # "NO_LINK" or an "Error: ..." string: without a limit a dead
                # network would make --all / --num spin forever.
                failure_count += 1
                if failure_count >= MAX_CONSECUTIVE_FAILURES:
                    log_error(
                        f"\nReached {MAX_CONSECUTIVE_FAILURES} consecutive failures, stopping..."
                    )
                    break

            if not repeat:
                break

            # Add a small delay between requests
            time.sleep(1)

    except KeyboardInterrupt:
        log_warning("\nProcess interrupted by user")
    finally:
        if pbar is not None:
            pbar.close()

        print(separator)
        log_success("Download summary:")
        log_info(f"Successfully downloaded: {downloaded} images")
        log_info(f"Stopped after encountering: {exists_count} existing files")


if __name__ == "__main__":
    main()