Web Scraper Application

Automated ski resort snow report data collection

Python Selenium BeautifulSoup Pandas Automation
Back to Projects

Project Overview

This project provides automated scripts to collect real-time snow report data from two different websites: Colorado Ski and OnTheSnow. By leveraging Python's Selenium library and BeautifulSoup, the scraper captures details such as snowfall, base depth, runs open, and lifts open. The collected data is consolidated into CSV files for further analysis or integration into dashboards.

Objectives

Key Results

Methodology

Skills Demonstrated

Technologies Used

Python Selenium BeautifulSoup Pandas Web Scraping Automation

Scripts

Below are abbreviated versions of the two scripts used to scrape data from coloradoski.com and onthesnow.com.

Colorado Ski Scraper

import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Scrape the Colorado Ski snow report page and append one row per resort
# (date, name, 24-hour snowfall, ...) to a cumulative CSV file.
driver_path = "C:/path/to/chromedriver.exe"
csv_file_path = "C:/path/to/snow_report_data_coloradoski.csv"

# Set up and launch Chrome
service = Service(driver_path)
driver = webdriver.Chrome(service=service)
try:
    driver.get("https://www.coloradoski.com/snow-report/")

    # Wait until at least one mid-mountain depth has loaded a non-zero value
    # (the page renders 0" placeholders before the live data arrives).
    WebDriverWait(driver, 30).until(
        lambda d: any(
            mid.text.strip() != '0"'
            for mid in d.find_elements(By.CLASS_NAME, "answer.mid-mtn")
        )
    )

    html = driver.page_source
finally:
    # Always close the browser, even when the wait times out or the page
    # fails to load; otherwise each failed run leaks a Chrome process.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

snow_data = []
resorts = soup.find_all("div", class_="inner")

for resort in resorts:
    name_tag = resort.find("h3", class_="h5 text-left")
    if not name_tag:
        # Skip "inner" divs that are not resort cards.
        continue

    name = name_tag.text.strip()
    # Look each field up once instead of searching the tree twice per field.
    snow_24hr_tag = resort.find("span", class_="answer twentyfour")
    snow_24hr = snow_24hr_tag.text.strip() if snow_24hr_tag else "N/A"
    # ... repeat for other fields ...

    snow_data.append({
        "Date": datetime.now().strftime("%Y-%m-%d"),
        "Resort": name,
        "Snow (24hr)": snow_24hr,
        # ...
    })

new_data_df = pd.DataFrame(snow_data)

# Append to the existing CSV if present, otherwise create it.
if os.path.exists(csv_file_path):
    existing_data_df = pd.read_csv(csv_file_path)
    combined_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
else:
    combined_df = new_data_df

combined_df.to_csv(csv_file_path, index=False)
print(new_data_df)

OnTheSnow Scraper

import os
import re
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Scrape the OnTheSnow Colorado ski report table and append the parsed rows
# to a cumulative CSV, logging progress to a file.
driver_path = r"C:\path\to\chromedriver.exe"
csv_file_path = r"C:\path\to\snow_report_data_onthesnow.csv"
log_file_path = r"C:\path\to\scraper_log.txt"

logging.basicConfig(filename=log_file_path, level=logging.INFO)

service = Service(driver_path)
driver = webdriver.Chrome(service=service)
try:
    driver.get("https://www.onthesnow.com/colorado/skireport")

    # Wait for at least one report row to render before grabbing the page.
    WebDriverWait(driver, 40).until(
        EC.presence_of_element_located((By.CLASS_NAME, "styles_row__HA9Yq"))
    )

    html = driver.page_source
finally:
    # Always close the browser, even when the wait times out; otherwise each
    # failed run leaks a Chrome process.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

snow_data = []
resorts = soup.find_all("tr", class_="styles_row__HA9Yq")

for resort in resorts:
    data_tags = resort.find_all("span", class_="h4 styles_h4__x3zzi")
    # ... parse resort name, snowfall, base depth, open trails, etc. ...

new_data_df = pd.DataFrame(snow_data)

# Example of splitting base depth, computing % trails open, etc.
# ...
# Append to the existing CSV if present (matches the Colorado Ski scraper)
# instead of overwriting the history on every run.
if os.path.exists(csv_file_path):
    existing_data_df = pd.read_csv(csv_file_path)
    combined_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
else:
    combined_df = new_data_df

combined_df.to_csv(csv_file_path, index=False)
logging.info("Data saved.")
print(new_data_df)

GitHub Repository

View on GitHub