mirror of
https://github.com/tcsenpai/spurelations.git
synced 2025-06-06 19:25:32 +00:00
first commit
This commit is contained in:
commit
a43ed9c899
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
tests
|
||||||
|
images
|
||||||
|
dist
|
||||||
|
src/spurelations/__pycache__
|
||||||
|
src/spurelations.egg-info
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 tcsenpai
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
38
README.md
Normal file
38
README.md
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
# Spurelations
|
||||||
|
|
||||||
|
Download spurious correlations from tylervigen.com.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install spurelations
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick tips
|
||||||
|
|
||||||
|
- Images are saved in `~/spurelations/images/`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Download a single correlation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
spurelations
|
||||||
|
```
|
||||||
|
|
||||||
|
### Download all correlations until stopped
|
||||||
|
|
||||||
|
```bash
|
||||||
|
spurelations --all
|
||||||
|
```
|
||||||
|
|
||||||
|
### Download N correlations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
spurelations --num 10
|
||||||
|
```
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Automatically extract the correlation data from the website
|
||||||
|
- Avoid downloading the same correlation multiple times
|
26
pyproject.toml
Normal file
26
pyproject.toml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=45", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "spurelations"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Download spurious correlations from tylervigen.com"
|
||||||
|
readme = "README.md"
|
||||||
|
authors = [{ name = "tcsenpai", email = "tcsenpai@discus.sh" }]
|
||||||
|
license = { file = "LICENSE" }
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
]
|
||||||
|
keywords = ["spurious", "correlations", "data", "visualization"]
|
||||||
|
dependencies = ["beautifulsoup4", "requests", "tqdm", "colorama"]
|
||||||
|
requires-python = ">=3.7"
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/tcsenpai/spurelations"
|
||||||
|
Repository = "https://github.com/tcsenpai/spurelations.git"
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
spurelations = "spurelations.main:main"
|
6
src/spurelations/__init__.py
Normal file
6
src/spurelations/__init__.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
"""Spurelations package."""
|
||||||
|
|
||||||
|
from .main import main
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
|
__all__ = ["main"]
|
162
src/spurelations/main.py
Normal file
162
src/spurelations/main.py
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
"""Spurelations - Download spurious correlations from tylervigen.com."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from tqdm import tqdm
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Initialize colorama for Windows compatibility
|
||||||
|
init()
|
||||||
|
|
||||||
|
|
||||||
|
def log_info(message):
    """Print *message* to stdout behind a cyan ``[INFO]`` tag."""
    tag = f"{Fore.CYAN}[INFO]{Style.RESET_ALL}"
    print(f"{tag} {message}")
|
||||||
|
|
||||||
|
|
||||||
|
def log_success(message):
    """Print *message* to stdout behind a green ``[SUCCESS]`` tag."""
    tag = f"{Fore.GREEN}[SUCCESS]{Style.RESET_ALL}"
    print(f"{tag} {message}")
|
||||||
|
|
||||||
|
|
||||||
|
def log_warning(message):
    """Print *message* to stdout behind a yellow ``[WARNING]`` tag."""
    tag = f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL}"
    print(f"{tag} {message}")
|
||||||
|
|
||||||
|
|
||||||
|
def log_error(message):
    """Print *message* to stdout behind a red ``[ERROR]`` tag."""
    tag = f"{Fore.RED}[ERROR]{Style.RESET_ALL}"
    print(f"{tag} {message}")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_png_link(html_content):
    """Return the URL of the page's "Download png" link, if there is one.

    Scans every ``<a>`` element and picks the first whose ``href`` ends with
    ``.png`` and whose visible text contains ``Download png``.

    Args:
        html_content: HTML markup of a correlation page.

    Returns:
        The PNG URL as a string, with relative hrefs resolved against the
        site's correlation base URL, or ``None`` when no link matches.
    """
    # Imported locally so this function does not depend on the module's
    # import block being updated.
    from urllib.parse import urljoin

    base_url = "https://tylervigen.com/spurious/correlation/"
    soup = BeautifulSoup(html_content, "html.parser")

    for link in soup.find_all("a"):
        href = link.get("href", "")
        if href.endswith(".png") and "Download png" in link.get_text():
            # urljoin leaves absolute URLs untouched and resolves every kind
            # of relative href ("image/x.png", "/path/x.png", "//host/x.png"),
            # not only those that start with "image/".
            href = urljoin(base_url, href)
            log_info(f"Found link: {href}")
            return href
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_png_from_page(url, timeout=30):
    """Fetch a correlation page and save the PNG it links to.

    The image is stored under ``~/spurelations/images/`` using the last path
    segment of the PNG URL as its file name.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed as ``timeout`` to each ``requests.get`` call,
            so a stalled server cannot hang the program indefinitely.

    Returns:
        ``"SUCCESS"`` when a new image was saved, ``"EXISTS"`` when the target
        file is already present, ``"NO_LINK"`` when the page has no PNG
        download link, or a string starting with ``"Error: "`` describing an
        HTTP or runtime failure.
    """
    try:
        log_info(f"Fetching page from: {url}")
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            failure = f"Error: page request returned HTTP {response.status_code}"
            log_error(failure)
            return failure
        log_info("Successfully retrieved page")

        png_link = extract_png_link(response.text)
        if not png_link:
            log_warning("No PNG download link found in the page")
            return "NO_LINK"
        log_info(f"Found PNG link: {png_link}")

        # Create images directory in user's home
        images_dir = Path.home() / "spurelations" / "images"
        images_dir.mkdir(parents=True, exist_ok=True)
        log_info(f"Ensured '{images_dir}' directory exists")

        # Extract filename from the PNG URL
        filename = png_link.split("/")[-1]
        filepath = images_dir / filename

        if filepath.exists():
            log_warning(f"File already exists: {filepath}")
            return "EXISTS"

        log_info(f"Downloading PNG from: {png_link}")
        # Write to a sibling ".part" file and rename once complete, so an
        # interrupted download never leaves a truncated image that a later
        # run would report as "EXISTS".
        partial_path = filepath.with_name(filepath.name + ".part")
        try:
            with requests.get(png_link, stream=True, timeout=timeout) as png_response:
                if png_response.status_code != 200:
                    failure = (
                        f"Error: PNG request returned HTTP {png_response.status_code}"
                    )
                    log_error(failure)
                    return failure
                with open(partial_path, "wb") as f:
                    # iter_content applies any Content-Encoding (gzip/deflate);
                    # reading response.raw directly would not.
                    for chunk in png_response.iter_content(chunk_size=8192):
                        f.write(chunk)
            partial_path.replace(filepath)
        finally:
            # After a successful rename the partial file is gone and this is a
            # no-op; otherwise remove the incomplete download.
            if partial_path.exists():
                partial_path.unlink()

        log_success(f"Successfully saved PNG to: {filepath}")
        return "SUCCESS"
    except Exception as e:
        # Deliberate best-effort boundary: the caller loops over many pages
        # and expects a status string rather than an exception.
        log_error(f"Error: {str(e)}")
        return f"Error: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: download one, N, or unlimited correlation PNGs.

    With no options a single random correlation is fetched. ``--num N`` keeps
    going until N new images have been saved; ``--all`` keeps going until
    interrupted with Ctrl+C. In either repeating mode the loop also stops
    after 10 consecutive pages whose image already exists locally. A summary
    is printed on exit.
    """
    parser = argparse.ArgumentParser(description="Download random correlation PNGs")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--all", action="store_true", help="Download images until Ctrl+C"
    )
    group.add_argument("--num", type=int, help="Download N images")
    args = parser.parse_args()

    # Reject 0 and negative counts explicitly; a bare truthiness check would
    # treat "--num 0" as if the option had not been given at all.
    if args.num is not None and args.num <= 0:
        parser.error("--num must be a positive integer")

    url = "https://tylervigen.com/spurious/random"
    exists_count = 0
    downloaded = 0
    # Bound before the try block so the finally clause can always test it.
    pbar = None

    try:
        if args.num is not None:
            # Use tqdm for progress bar when --num is specified
            pbar = tqdm(total=args.num, desc="Downloading images")

        while True:
            print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
            log_info("Starting PNG extraction process...")
            result = get_png_from_page(url)

            if result == "EXISTS":
                exists_count += 1
                if exists_count >= 10:
                    # Blank line first so the tag and its message stay on the
                    # same line.
                    print()
                    log_warning("Reached 10 existing files, stopping...")
                    break
            elif result == "SUCCESS":
                downloaded += 1
                exists_count = 0  # Reset counter on successful download

                if pbar is not None:
                    pbar.update(1)
                    if downloaded >= args.num:
                        print()
                        log_success(
                            f"Reached target of {args.num} downloads, stopping..."
                        )
                        break

            # Single-shot mode: one attempt only, whatever the outcome.
            if not args.all and pbar is None:
                break

            # Add a small delay between requests
            time.sleep(1)

    except KeyboardInterrupt:
        print()
        log_warning("Process interrupted by user")
    finally:
        if pbar is not None:
            pbar.close()

        print(f"\n{Fore.BLUE}{'='*50}{Style.RESET_ALL}")
        log_success("Download summary:")
        log_info(f"Successfully downloaded: {downloaded} images")
        log_info(f"Stopped after encountering: {exists_count} existing files")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
x
Reference in New Issue
Block a user