From a43ed9c899ae12d1a3fbd67dba10ef15707a57ce Mon Sep 17 00:00:00 2001
From: tcsenpai <dev@tcsenpai.com>
Date: Wed, 1 Jan 2025 13:33:44 +0100
Subject: [PATCH] first commit

---
 .gitignore                   |   5 ++
 LICENSE                      |  21 +++++
 README.md                    |  38 ++++++++
 pyproject.toml               |  26 ++++++
 src/spurelations/__init__.py |   6 ++
 src/spurelations/main.py     | 162 +++++++++++++++++++++++++++++++++++
 6 files changed, 258 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 pyproject.toml
 create mode 100644 src/spurelations/__init__.py
 create mode 100644 src/spurelations/main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..795b551
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+tests
+images
+dist
+src/spurelations/__pycache__
+src/spurelations.egg-info
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d85a048
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 tcsenpai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a910828
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# Spurelations
+
+Download spurious correlations from tylervigen.com.
+
+## Installation
+
+```bash
+pip install spurelations
+```
+
+## Quick tips
+
+- Images are saved in `~/spurelations/images/`
+
+## Usage
+
+### Download a single correlation
+
+```bash
+spurelations
+```
+
+### Download all correlations until stopped
+
+```bash
+spurelations --all
+```
+
+### Download N correlations
+
+```bash
+spurelations --num 10
+```
+
+## Features
+
+- Automatically extract the correlation data from the website
+- Avoid downloading the same correlation multiple times
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1486c91
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "spurelations"
+version = "0.1.1"
+description = "Download spurious correlations from tylervigen.com"
+readme = "README.md"
+authors = [{ name = "tcsenpai", email = "tcsenpai@discus.sh" }]
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+keywords = ["spurious", "correlations", "data", "visualization"]
+dependencies = ["beautifulsoup4", "requests", "tqdm", "colorama"]
+requires-python = ">=3.7"
+
+[project.urls]
+Homepage = "https://github.com/tcsenpai/spurelations"
+Repository = "https://github.com/tcsenpai/spurelations.git"
+
+[project.scripts]
+spurelations = "spurelations.main:main"
diff --git a/src/spurelations/__init__.py b/src/spurelations/__init__.py
new file mode 100644
index 0000000..5c9f0d5
--- /dev/null
+++ b/src/spurelations/__init__.py
@@ -0,0 +1,6 @@
+"""Spurelations package."""
+
+# Re-export the version from main.py so the package keeps one copy of the string.
+from .main import __version__, main
+
+__all__ = ["main"]
diff --git a/src/spurelations/main.py b/src/spurelations/main.py
new file mode 100644
index 0000000..1376ba3
--- /dev/null
+++ b/src/spurelations/main.py
@@ -0,0 +1,162 @@
+"""Spurelations - Download spurious correlations from tylervigen.com."""
+
+__version__ = "0.1.0"  # NOTE(review): pyproject.toml declares 0.1.1 - confirm which is current
+
+from bs4 import BeautifulSoup
+import requests
+import os
+import shutil
+import argparse
+import sys
+import time
+from tqdm import tqdm
+from colorama import Fore, Style, init
+import tempfile
+from pathlib import Path
+
+# Initialize colorama once at import time (kept for Windows console compatibility)
+init()
+
+
+def log_info(message):  # cyan "[INFO]" tag followed by the message, printed to stdout
+    print("{}[INFO]{} {}".format(Fore.CYAN, Style.RESET_ALL, message))
+
+
+def log_success(message):  # green "[SUCCESS]" tag followed by the message, on stdout
+    print("{}[SUCCESS]{} {}".format(Fore.GREEN, Style.RESET_ALL, message))
+
+
+def log_warning(message):  # yellow "[WARNING]" tag followed by the message, on stdout
+    print("{}[WARNING]{} {}".format(Fore.YELLOW, Style.RESET_ALL, message))
+
+
+def log_error(message):  # red "[ERROR]" tag followed by the message, printed to stdout
+    print("{}[ERROR]{} {}".format(Fore.RED, Style.RESET_ALL, message))
+
+
+def extract_png_link(html_content):
+    """Return the absolute URL of the page's "Download png" link, or None if absent."""
+    from urllib.parse import urljoin  # local import: resolves any relative href form
+
+    base_url = "https://tylervigen.com/spurious/correlation/"
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Match on the anchor's visible text plus a .png target; no attribute is checked.
+    for anchor in soup.find_all("a", href=True):
+        if anchor["href"].endswith(".png") and "Download png" in anchor.get_text():
+            # urljoin keeps absolute URLs and resolves relative ones against base_url.
+            href = urljoin(base_url, anchor["href"])
+            log_info(f"Found link: {href}")
+            return href
+    return None
+
+
+def get_png_from_page(url):
+    """Fetch a correlation page and save its chart PNG under ~/spurelations/images.
+
+    Args:
+        url: Address of the page to scrape for a "Download png" link.
+
+    Returns:
+        "SUCCESS" when a new image was saved, "EXISTS" when the target file is
+        already on disk, "NO_LINK" when the page has no PNG link, or a string
+        starting with "Error: " when a request or file operation failed.
+    """
+    timeout = 30  # seconds; without one a stalled connection blocks the loop forever
+    try:
+        log_info(f"Fetching page from: {url}")
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()  # report non-200 instead of returning None
+        log_info("Successfully retrieved page")
+
+        png_link = extract_png_link(response.text)
+        if not png_link:
+            log_warning("No PNG download link found in the page")
+            return "NO_LINK"
+        log_info(f"Found PNG link: {png_link}")
+
+        images_dir = Path.home() / "spurelations" / "images"
+        images_dir.mkdir(parents=True, exist_ok=True)
+        log_info(f"Ensured '{images_dir}' directory exists")
+        filepath = images_dir / png_link.split("/")[-1]
+        if filepath.exists():
+            log_warning(f"File already exists: {filepath}")
+            return "EXISTS"
+
+        log_info(f"Downloading PNG from: {png_link}")
+        png_response = requests.get(png_link, stream=True, timeout=timeout)
+        png_response.raise_for_status()  # a failed download must not count as SUCCESS
+        # Stream into a temporary directory first so a transfer that dies midway does
+        # not leave a truncated file that later runs would treat as already downloaded.
+        with tempfile.TemporaryDirectory() as temp_dir:
+            partial = Path(temp_dir) / filepath.name
+            with open(partial, "wb") as f:
+                for chunk in png_response.iter_content(chunk_size=65536):
+                    f.write(chunk)
+            shutil.move(str(partial), str(filepath))
+        log_success(f"Successfully saved PNG to: {filepath}")
+        return "SUCCESS"
+    except Exception as e:
+        log_error(f"Error: {str(e)}")
+        return f"Error: {str(e)}"
+
+
+def main():
+    """Command-line entry point: download one, N (--num) or unlimited (--all) images.
+
+    The loop stops on Ctrl+C, once --num new images were saved, or after 10
+    already-existing images with no new download in between; a summary is printed last.
+    """
+    parser = argparse.ArgumentParser(description="Download random correlation PNGs")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--all", action="store_true", help="Download images until Ctrl+C"
+    )
+    group.add_argument("--num", type=int, help="Download N images")
+    args = parser.parse_args()
+    # A bare truth test treats --num 0 like "flag absent"; reject non-positive counts.
+    if args.num is not None and args.num < 1:
+        parser.error("--num must be a positive integer")
+
+    url = "https://tylervigen.com/spurious/random"
+    exists_count = 0
+    downloaded = 0
+    pbar = tqdm(total=args.num, desc="Downloading images") if args.num else None
+
+    try:
+        while True:
+            print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
+            log_info("Starting PNG extraction process...")
+            result = get_png_from_page(url)
+
+            if result == "EXISTS":
+                exists_count += 1
+                if exists_count >= 10:
+                    log_warning("\nReached 10 existing files, stopping...")
+                    break
+            elif result == "SUCCESS":
+                downloaded += 1
+                exists_count = 0  # Reset counter on successful download
+                if pbar is not None:
+                    pbar.update(1)
+                    if downloaded >= args.num:
+                        log_success(
+                            f"\nReached target of {args.num} downloads, stopping..."
+                        )
+                        break
+            if not args.all and pbar is None:
+                break  # single-shot mode: exactly one attempt
+            time.sleep(1)  # small delay between requests
+    except KeyboardInterrupt:
+        log_warning("\nProcess interrupted by user")
+    finally:
+        if pbar is not None:
+            pbar.close()
+
+    print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
+    log_success("Download summary:")
+    log_info(f"Successfully downloaded: {downloaded} images")
+    log_info(f"Stopped after encountering: {exists_count} existing files")
+
+
+if __name__ == "__main__":  # run the CLI when this module is executed as a script
+    main()