Spaces:

Mihkelmj
/

utrecht-pollution-prediction

Sleeping

File size: 7,091 Bytes

import codecs
import csv
import http.client
import json
import os
import re
import sys
import urllib.error
import urllib.request
from datetime import date, timedelta
from io import StringIO

import pandas as pd

# CSV file (relative to the working directory) that caches the downloaded
# weather rows; read and rewritten by update_weather_data().
WEATHER_DATA_FILE = "weather_data.csv"
# CSV file (relative to the working directory) that caches the averaged
# NO2/O3 values; read and rewritten by update_pollution_data().
POLLUTION_DATA_FILE = "pollution_data.csv"


def update_weather_data():
    """Download missing daily weather rows for Utrecht and append them to the cache.

    The cache is ``WEATHER_DATA_FILE``. When it exists, rows are requested from
    the day after the newest cached date up to today; otherwise the last seven
    days plus today are requested. New rows are concatenated to the cached
    ones, de-duplicated on ``date`` (keeping the newest) and written back.

    Nothing is requested when the cache already contains today's row.

    Raises:
        SystemExit: if the weather service cannot be reached or answers with
            an HTTP error (the error is printed first).
    """
    current_day = date.today()
    today = current_day.isoformat()
    first_day = current_day - timedelta(7)

    if os.path.exists(WEATHER_DATA_FILE):
        df = pd.read_csv(WEATHER_DATA_FILE)
        last_date = pd.to_datetime(df["date"]).max()
        # An empty cache file yields NaT; fall back to the 7-day default
        # instead of formatting "NaT" into the request URL.
        if not pd.isna(last_date):
            first_day = (last_date + timedelta(1)).date()
    else:
        df = pd.DataFrame()

    if first_day > current_day:
        # Cache is already up to date; a request with start > end would only
        # produce an error response.
        return

    start_date = first_day.isoformat()

    # NOTE(review): the API key is hard-coded in the URL below; consider
    # loading it from configuration instead of keeping it in source control.
    url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{today}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"

    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as response:
            rows = list(csv.reader(codecs.iterdecode(response, "utf-8")))
    except urllib.error.HTTPError as e:
        ErrorInfo = e.read().decode()
        print("Error code: ", e.code, ErrorInfo)
        sys.exit()
    except urllib.error.URLError as e:
        # A plain URLError has no body or status code, only a reason.
        print("Error: ", e.reason)
        sys.exit()

    # Drop blank lines; with only a header (or nothing) there is no new data.
    rows = [row for row in rows if row]
    if len(rows) < 2:
        return

    new_data = pd.DataFrame(rows)
    new_data.columns = new_data.iloc[0]  # first CSV row is the header
    new_data = new_data[1:]
    new_data = new_data.rename(columns={"datetime": "date"})

    updated_df = pd.concat([df, new_data], ignore_index=True)
    updated_df = updated_df.drop_duplicates(subset="date", keep="last")
    updated_df.to_csv(WEATHER_DATA_FILE, index=False)


def _fetch_measurement_values(station, particle, start, end):
    """Return the numeric values one station reported for one pollutant.

    Args:
        station: Luchtmeetnet station number, e.g. ``"NL10636"``.
        particle: Pollutant formula passed to the API, e.g. ``"NO2"``.
        start: Start of the time window as an ISO-8601 timestamp string.
        end: End of the time window as an ISO-8601 timestamp string.

    Returns:
        A list of floats; empty when the request does not succeed or the
        response carries no usable measurements.
    """
    conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
    try:
        conn.request(
            "GET",
            f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={end}&start={start}",
        )
        res = conn.getresponse()
        status = res.status
        body = res.read().decode("utf-8")
    finally:
        # Always release the socket, also when the request fails.
        conn.close()

    if status != 200:
        return []

    try:
        # NOTE(review): assumes the response is a JSON object whose "data"
        # entry is a list of records with a numeric "value" field - confirm
        # against the Luchtmeetnet API documentation.
        records = json.loads(body).get("data") or []
    except (ValueError, AttributeError):
        return []

    values = []
    for record in records:
        value = record.get("value") if isinstance(record, dict) else None
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            values.append(float(value))
    return values


def update_pollution_data():
    """Refresh the cached daily NO2 and O3 averages.

    For each of the last eight days (today included) the measurements of
    three stations are fetched for the 24 hours ending at 09:00Z of that day
    and averaged per pollutant. The result has one row per day with the
    columns ``date``, ``NO2`` and ``O3``; a pollutant without any measurement
    for a day is stored as NaN so that the row still exists for that date.

    When ``POLLUTION_DATA_FILE`` exists, only rows newer than its latest
    date are appended; otherwise the file is created from the eight rows.
    """
    particles = ["NO2", "O3"]
    stations = ["NL10636", "NL10639", "NL10643"]
    current_day = date.today()

    rows = []
    # Walk from the oldest day to today so the rows come out chronologically.
    for days_back in range(7, -1, -1):
        window_end = current_day - timedelta(days_back)
        end = window_end.isoformat() + "T09:00:00Z"
        start = (window_end - timedelta(1)).isoformat() + "T09:00:00Z"

        row = {"date": window_end.isoformat()}
        for particle in particles:
            # Collect values per pollutant and per day; nothing is carried
            # over from other pollutants or other days.
            values = []
            for station in stations:
                values.extend(
                    _fetch_measurement_values(station, particle, start, end)
                )
            row[particle] = sum(values) / len(values) if values else float("nan")
        rows.append(row)

    avg_combined_data = pd.DataFrame(rows, columns=["date"] + particles)

    if os.path.exists(POLLUTION_DATA_FILE):
        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
        # Normalise cached dates to YYYY-MM-DD so they compare and
        # de-duplicate consistently with the freshly built rows, even if an
        # earlier run stored them with a time component.
        existing_data["date"] = existing_data["date"].astype(str).str[:10]
        if existing_data.empty:
            new_data = avg_combined_data
        else:
            # ISO dates sort lexicographically in chronological order.
            last_date = existing_data["date"].max()
            new_data = avg_combined_data[avg_combined_data["date"] > last_date]
        updated_data = pd.concat([existing_data, new_data], ignore_index=True)
        updated_data = updated_data.drop_duplicates(subset="date", keep="last")
    else:
        updated_data = avg_combined_data

    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)


def reverse_pollution(NO2, O3, data):
    """Write the NO2 and O3 readings into ``data`` in reversed order.

    The last element of each list is stored at row label 0, the one before it
    at row label 1, and so on. ``data`` is modified in place and returned;
    both input lists are emptied as a side effect.
    """
    for column, readings in (("NO2", NO2), ("O3", O3)):
        for row_label, reading in enumerate(reversed(readings)):
            data.loc[row_label, column] = reading
        # The caller's list ends up drained.
        readings.clear()
    return data


def get_combined_data():
    """Refresh both caches and return weather and pollution data as one frame.

    Calls ``update_weather_data()`` and ``update_pollution_data()``, loads the
    weather cache, reorders and renames its columns, rescales the numeric
    weather columns to integers, adds a ``weekday`` column and fills ``NO2``
    and ``O3`` from the pollution cache by matching on the date.

    Returns:
        pandas.DataFrame: one row per cached weather day. ``NO2``/``O3`` are
        NaN for days that have no entry in the pollution cache.
    """
    update_weather_data()
    update_pollution_data()

    weather_df = pd.read_csv(WEATHER_DATA_FILE)

    # Add placeholder columns, then move columns around by position.
    # NOTE(review): the positional indices below depend on the column order
    # of the cached weather CSV - confirm against the file's header.
    weather_df.insert(1, "NO2", None)
    weather_df.insert(2, "O3", None)
    weather_df.insert(10, "weekday", None)
    columns = list(weather_df.columns)
    columns.insert(3, columns.pop(6))
    weather_df = weather_df[columns]
    columns.insert(5, columns.pop(9))
    weather_df = weather_df[columns]
    columns.insert(9, columns.pop(6))
    weather_df = weather_df[columns]

    combined_df = weather_df

    # Rename the weather columns to the names used in the rest of this function.
    combined_df = combined_df.rename(
        columns={
            "date": "date",
            "windspeed": "wind_speed",
            "temp": "mean_temp",
            "solarradiation": "global_radiation",
            "precip": "percipitation",
            "sealevelpressure": "pressure",
            "visibility": "minimum_visibility",
        }
    )

    combined_df["date"] = pd.to_datetime(combined_df["date"])
    combined_df["weekday"] = combined_df["date"].dt.day_name()

    # Rescale to tenths of a unit; wind speed is additionally divided by 3.6
    # (presumably km/h -> m/s - TODO confirm the unit convention).
    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
    combined_df["mean_temp"] = combined_df["mean_temp"] * 10
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
    combined_df["percipitation"] = combined_df["percipitation"] * 10
    combined_df["pressure"] = combined_df["pressure"] * 10

    combined_df["wind_speed"] = combined_df["wind_speed"].astype(int)
    combined_df["mean_temp"] = combined_df["mean_temp"].astype(int)
    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(int)
    combined_df["percipitation"] = combined_df["percipitation"].astype(int)
    combined_df["pressure"] = combined_df["pressure"].astype(int)
    combined_df["humidity"] = combined_df["humidity"].astype(int)
    combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)

    # Attach the pollution values by date rather than by row position, so a
    # gap or a different number of rows in either cache cannot pair a day's
    # weather with another day's pollution.
    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
    # Keep only the YYYY-MM-DD part so dates stored with a time component
    # still parse and match.
    pollution_df["date"] = pd.to_datetime(pollution_df["date"].astype(str).str[:10])
    pollution_by_date = pollution_df.drop_duplicates(
        subset="date", keep="last"
    ).set_index("date")
    day_keys = combined_df["date"].dt.normalize()
    combined_df["NO2"] = day_keys.map(pollution_by_date["NO2"])
    combined_df["O3"] = day_keys.map(pollution_by_date["O3"])

    return combined_df