mirror of
https://github.com/tcsenpai/spurelations.git
synced 2025-06-02 17:30:16 +00:00
first commit
This commit is contained in:
commit
a43ed9c899
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
tests
|
||||
images
|
||||
dist
|
||||
src/spurelations/__pycache__
|
||||
src/spurelations.egg-info
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 tcsenpai
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
38
README.md
Normal file
38
README.md
Normal file
@ -0,0 +1,38 @@
|
||||
# Spurelations
|
||||
|
||||
Download spurious correlations from tylervigen.com.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install spurelations
|
||||
```
|
||||
|
||||
## Quick tips
|
||||
|
||||
- Images are saved in `~/spurelations/images/`
|
||||
|
||||
## Usage
|
||||
|
||||
### Download a single correlation
|
||||
|
||||
```bash
|
||||
spurelations
|
||||
```
|
||||
|
||||
### Download all correlations until stopped
|
||||
|
||||
```bash
|
||||
spurelations --all
|
||||
```
|
||||
|
||||
### Download N correlations
|
||||
|
||||
```bash
|
||||
spurelations --num 10
|
||||
```
|
||||
|
||||
## Features
|
||||
|
||||
- Automatically extract the correlation data from the website
|
||||
- Avoid downloading the same correlation multiple times
|
26
pyproject.toml
Normal file
26
pyproject.toml
Normal file
@ -0,0 +1,26 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=45", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "spurelations"
|
||||
version = "0.1.1"
|
||||
description = "Download spurious correlations from tylervigen.com"
|
||||
readme = "README.md"
|
||||
authors = [{ name = "tcsenpai", email = "tcsenpai@discus.sh" }]
|
||||
license = { file = "LICENSE" }
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
keywords = ["spurious", "correlations", "data", "visualization"]
|
||||
dependencies = ["beautifulsoup4", "requests", "tqdm", "colorama"]
|
||||
requires-python = ">=3.7"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/tcsenpai/spurelations"
|
||||
Repository = "https://github.com/tcsenpai/spurelations.git"
|
||||
|
||||
[project.scripts]
|
||||
spurelations = "spurelations.main:main"
|
6
src/spurelations/__init__.py
Normal file
6
src/spurelations/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
"""Spurelations package."""
|
||||
|
||||
from .main import main
|
||||
|
||||
__version__ = "0.1.0"
|
||||
__all__ = ["main"]
|
162
src/spurelations/main.py
Normal file
162
src/spurelations/main.py
Normal file
@ -0,0 +1,162 @@
|
||||
"""Spurelations - Download spurious correlations from tylervigen.com."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import os
|
||||
import shutil
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
from colorama import Fore, Style, init
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Initialize colorama for Windows compatibility
|
||||
init()
|
||||
|
||||
|
||||
def log_info(message):
    """Print *message* to stdout behind a cyan ``[INFO]`` tag."""
    line = f"{Fore.CYAN}[INFO]{Style.RESET_ALL} {message}"
    print(line)
|
||||
|
||||
|
||||
def log_success(message):
    """Print *message* to stdout behind a green ``[SUCCESS]`` tag."""
    line = f"{Fore.GREEN}[SUCCESS]{Style.RESET_ALL} {message}"
    print(line)
|
||||
|
||||
|
||||
def log_warning(message):
    """Print *message* to stdout behind a yellow ``[WARNING]`` tag."""
    line = f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {message}"
    print(line)
|
||||
|
||||
|
||||
def log_error(message):
    """Print *message* to stdout behind a red ``[ERROR]`` tag."""
    line = f"{Fore.RED}[ERROR]{Style.RESET_ALL} {message}"
    print(line)
|
||||
|
||||
|
||||
def extract_png_link(html_content):
    """Return the absolute URL of the page's "Download png" link, if any.

    Every ``<a>`` element in *html_content* is inspected in document order;
    the first one whose ``href`` ends with ``.png`` and whose text contains
    ``"Download png"`` wins.

    Args:
        html_content: HTML markup of a correlation page.

    Returns:
        The PNG URL as a string, or ``None`` when no anchor matches.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    # Relative hrefs are resolved against this prefix. For the "image/..."
    # form this yields exactly the URL that was previously built by hand.
    base_url = "https://tylervigen.com/spurious/correlation/"

    soup = BeautifulSoup(html_content, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href", "")
        if not href.endswith(".png") or "Download png" not in link.get_text():
            continue

        # urljoin leaves absolute URLs untouched and also resolves the
        # relative forms the old "image/" prefix check missed ("/x.png",
        # "./image/x.png", "//host/x.png"), so callers always get something
        # that can be fetched directly.
        href = urljoin(base_url, href)
        log_info(f"Found link: {href}")
        return href

    return None
|
||||
|
||||
|
||||
def get_png_from_page(url, timeout=30):
    """Fetch *url*, locate its PNG download link and save the image.

    Images are stored in ``~/spurelations/images/`` under the last path
    segment of the PNG URL. The download is staged in a temporary directory
    and only moved into place once it has been fully received, so an
    interrupted transfer never leaves a truncated file that later runs would
    mistake for an already-downloaded image.

    Args:
        url: Address of a page expected to contain a "Download png" link.
        timeout: Seconds handed to ``requests.get`` for both the page and
            the image request, so a stalled server cannot hang the program.

    Returns:
        ``"SUCCESS"`` when a new image was saved,
        ``"EXISTS"`` when the target file is already present,
        ``"NO_LINK"`` when the page has no PNG download link,
        ``None`` when either request answers with a non-200 status, or
        ``"Error: <message>"`` when any exception is raised along the way.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            log_info(f"Fetching page from: {url}")
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                log_error(f"Unexpected HTTP status {response.status_code} for: {url}")
                return None
            log_info("Successfully retrieved page")

            # Copy of the fetched page; it disappears together with the
            # temporary directory when this function returns.
            temp_html = Path(temp_dir) / "random.html"
            with open(temp_html, "w", encoding="utf-8") as f:
                f.write(response.text)
            log_info("Saved HTML content to temporary file")

            png_link = extract_png_link(response.text)
            if not png_link:
                log_warning("No PNG download link found in the page")
                return "NO_LINK"
            log_info(f"Found PNG link: {png_link}")

            # Create images directory in user's home
            images_dir = Path.home() / "spurelations" / "images"
            images_dir.mkdir(parents=True, exist_ok=True)
            log_info(f"Ensured '{images_dir}' directory exists")

            # The file name is the last path segment of the PNG URL
            filename = png_link.split("/")[-1]
            filepath = images_dir / filename

            if filepath.exists():
                log_warning(f"File already exists: {filepath}")
                return "EXISTS"

            log_info(f"Downloading PNG from: {png_link}")
            staged = Path(temp_dir) / filename
            # The context manager releases the streamed connection even when
            # the status check or the write fails.
            with requests.get(png_link, stream=True, timeout=timeout) as png_response:
                if png_response.status_code != 200:
                    log_error(
                        f"Unexpected HTTP status {png_response.status_code} "
                        f"for: {png_link}"
                    )
                    return None
                with open(staged, "wb") as f:
                    # iter_content undoes any Content-Encoding; reading
                    # .raw directly would write compressed bytes verbatim.
                    for chunk in png_response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # str() keeps shutil.move happy on older Python 3 releases.
            shutil.move(str(staged), str(filepath))
            log_success(f"Successfully saved PNG to: {filepath}")
            return "SUCCESS"

        except Exception as e:
            # Boundary handler: report the failure and hand the caller a
            # string so a batch run can carry on with the next page.
            log_error(f"Error: {str(e)}")
            return f"Error: {str(e)}"
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for downloading random correlation images.

    Modes (``--all`` and ``--num`` are mutually exclusive):

    * no option: process a single random page and stop;
    * ``--all``: keep going until interrupted with Ctrl+C;
    * ``--num N``: keep going until ``N`` new images have been saved, with a
      tqdm progress bar. ``N`` must be a positive integer.

    In every mode the loop also stops once 10 pages in a row (without a
    successful download in between) point at an image that is already on
    disk. A summary is printed at the end.
    """
    parser = argparse.ArgumentParser(description="Download random correlation PNGs")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--all", action="store_true", help="Download images until Ctrl+C"
    )
    group.add_argument("--num", type=int, help="Download N images")
    args = parser.parse_args()

    # Zero used to fall through to single-download mode and a negative value
    # "reached its target" after one file; reject both up front instead.
    if args.num is not None and args.num <= 0:
        parser.error("--num must be a positive integer")

    url = "https://tylervigen.com/spurious/random"
    exists_count = 0  # "EXISTS" results seen since the last successful download
    downloaded = 0
    pbar = None  # only created in --num mode

    try:
        if args.num:
            # Use tqdm for progress bar when --num is specified
            pbar = tqdm(total=args.num, desc="Downloading images")

        while True:
            print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
            log_info("Starting PNG extraction process...")
            result = get_png_from_page(url)

            if result == "EXISTS":
                exists_count += 1
                if exists_count >= 10:
                    log_warning("\nReached 10 existing files, stopping...")
                    break
            elif result == "SUCCESS":
                downloaded += 1
                exists_count = 0  # Reset counter on successful download

                if pbar is not None:
                    pbar.update(1)
                    if downloaded >= args.num:
                        log_success(
                            f"\nReached target of {args.num} downloads, stopping..."
                        )
                        break

            # Without --all / --num a single attempt is all that was asked for.
            if not args.all and not args.num:
                break

            # Add a small delay between requests
            time.sleep(1)

    except KeyboardInterrupt:
        log_warning("\nProcess interrupted by user")
    finally:
        # Keyed on the bar itself so cleanup cannot touch an unbound name.
        if pbar is not None:
            pbar.close()

    print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
    log_success("Download summary:")
    log_info(f"Successfully downloaded: {downloaded} images")
    log_info(f"Stopped after encountering: {exists_count} existing files")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
x
Reference in New Issue
Block a user