# 29.04.24

import httpx
from bs4 import BeautifulSoup


# URL of the webpage containing the table
url = 'https://icannwiki.org/All_New_gTLD_Applications'

# List to store scraped records
data = []

# Application notes to keep, and registry applicants to exclude, when filtering rows
preference_note = ['DELEGATED', 'WITHDRAWN', '']
preference_registry = ['Verisign', 'KSregistry', 'KNET']


# Function to scrape new gTLD applications into the module-level `data` list
def scrape_new_gtld_applications(url):

    # Send a GET request to the URL
    response = httpx.get(url)

    # Check if the response is successful
    if response.status_code == 200:

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing the gTLD applications
        table = soup.find('table', class_='wikitable')

        # Check if the table is found
        if table:

            # Iterate over each row in the table, skipping the header row
            for row in table.find_all('tr')[1:]:
                columns = row.find_all('td')

                # Skip malformed rows that lack the six expected columns
                if len(columns) < 6:
                    continue

                # Extract data from each column of the row
                application_id = columns[0].get_text(strip=True)
                tld = columns[1].get_text(strip=True)
                category = columns[2].get_text(strip=True)
                applicant = columns[3].get_text(strip=True)
                application_status = columns[4].get_text(strip=True)

                # Keep only the first word of the notes column (e.g. 'DELEGATED')
                notes = columns[5].get_text(strip=True).split(" ")[0]

                # Check if the note is in the preference list
                if notes in preference_note:

                    # Check if the applicant is not in the preference registry list
                    if applicant not in preference_registry:

                        # Add the extracted data to the list
                        data.append({
                            'application_id': application_id,
                            'tld': tld,
                            'category': category,
                            'applicant': applicant,
                            'application_status': application_status,
                            'notes': notes
                        })
        else:
            print("Table not found on the page.")
    else:
        print("Failed to fetch the page.")
def main():

    # Run the scraper
    scrape_new_gtld_applications(url)

    # Print the number of records scraped
    print(len(data))

    # Write one application ID per line to a plain-text file
    with open('data.txt', 'w') as output_file:
        for find_tld in data:
            output_file.write(find_tld['application_id'] + "\n")
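# Illustrative sketch (an addition, not part of the original script): if actual
# JSON output of every scraped field is wanted, rather than one application ID
# per line, the standard-library json module can dump the full records.
# `write_json` is a hypothetical helper; main() could call it on `data`.
def write_json(records, path='data.json'):
    import json

    # Serialize all scraped fields, not just the application IDs
    with open(path, 'w') as json_file:
        json.dump(records, json_file, indent=2)
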
if __name__ == '__main__':
    main()