Spaces:

Mihkelmj
/

utrecht-pollution-prediction

Sleeping

App Files Community

elisaklunder committed on Oct 24, 2024

Commit

f4930a4

1 Parent(s): 3dd6a8c

data finally working

Browse files

Files changed (8) hide show

app.py +0 -1
past_pollution_data.csv +12 -0
past_weather_data.csv +12 -0
src/data_api_calls.py +47 -71
src/past_data_api_calls copy.py +0 -199
src/past_data_api_calls.py +72 -22
src/predict.py +10 -5
test.ipynb +94 -512

app.py CHANGED Viewed

@@ -35,7 +35,6 @@ no2_values = pd.concat([no2_past_values, no2_future_values], ignore_index=True)
 dates = dates_past + dates_future
 df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
 # App Title
 st.title("Utrecht Pollution Dashboard🌱")

 dates = dates_past + dates_future
 df = pd.DataFrame({"Date": dates, "O3": o3_values, "NO2": no2_values})
 # App Title
 st.title("Utrecht Pollution Dashboard🌱")

past_pollution_data.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+date,NO2,O3
+2023-10-18,10.842702702702699,39.81260000000001
+2023-10-19,17.97026666666666,31.779024390243908
+2023-10-20,17.233055555555563,18.7156
+2023-10-21,15.023599999999993,22.04
+2023-10-22,8.723378378378372,48.33439999999999
+2023-10-23,20.634266666666676,15.586000000000002
+2023-10-24,15.115599999999999,24.628085106382972
+2023-10-25,22.885675675675678,27.117599999999992
+2023-10-26,21.531756756756756,13.3216
+2023-10-27,23.07226666666666,16.15416666666666
+2023-10-28,24.89121621621622,24.59040816326531

past_weather_data.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+date,temp,humidity,precip,windspeed,sealevelpressure,visibility,solarradiation
+2023-10-17,8.5,84.8,0.0,22.3,1019.3,34.8,75.2
+2023-10-18,9.0,77.9,2.3,25.9,1006.0,23.8,71.2
+2023-10-19,14.5,94.0,11.4,22.3,990.8,21.2,39.8
+2023-10-20,11.9,97.4,20.4,25.9,981.0,10.4,7.0
+2023-10-21,13.1,88.0,3.5,22.3,989.4,27.7,39.9
+2023-10-22,12.1,87.3,3.9,25.9,1003.6,32.3,55.9
+2023-10-23,9.9,95.7,0.5,18.0,1011.1,5.9,43.8
+2023-10-24,11.6,92.3,6.5,22.3,1001.3,23.1,32.6
+2023-10-25,9.3,96.8,15.3,18.0,996.8,15.7,14.5
+2023-10-26,9.4,97.6,0.1,11.2,995.6,4.8,36.0
+2023-10-27,10.6,97.9,11.4,14.8,992,9.5,20.5

src/data_api_calls.py CHANGED Viewed

@@ -58,85 +58,61 @@ def update_pollution_data():
     all_dataframes = []
     today = date.today().isoformat() + "T09:00:00Z"
     yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
-    latest_date = (date.today() - timedelta(8)).isoformat() + "T09:00:00Z"
-    days_today = 0
-    days_yesterday = 1
-    while today != latest_date:
-        days_today += 1
-        days_yesterday += 1
-        for particle in particles:
-            for station in stations:
-                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ""
-                headers = {}
-                conn.request(
-                    "GET",
-                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
-                    payload,
-                    headers,
-                )
-                res = conn.getresponse()
-                data = res.read()
-                decoded_data = data.decode("utf-8")
-                df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like="value")
-                all_dataframes.append(df)
-            combined_data = pd.concat(all_dataframes, ignore_index=True)
-            values = []
-            for row in combined_data:
-                cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
-                if cleaned_value:  # If we successfully extract a number
-                    values.append(
-                        float(cleaned_value[0])
-                    )  # Convert the first match to float
-            # Compute the average if the values list is not empty
-            if values:
-                avg = sum(values) / len(values)
-                if particle == "NO2":
-                    NO2.append(avg)
-                else:
-                    O3.append(avg)
-        today = (date.today() - timedelta(days_today)).isoformat() + "T09:00:00Z"
-        yesterday = (
-            date.today() - timedelta(days_yesterday)
-        ).isoformat() + "T09:00:00Z"
-    avg_combined_data = pd.DataFrame(
         {
-            "date": pd.date_range(end=date.today(), periods=len(NO2)),
             "NO2": NO2,
             "O3": O3,
         }
     )
-    avg_combined_data = reverse_pollution(NO2, O3, avg_combined_data)
-    if os.path.exists(POLLUTION_DATA_FILE):
-        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
-        last_date = pd.to_datetime(existing_data["date"]).max()
-        new_data = avg_combined_data[avg_combined_data["date"] > last_date]
-        updated_data = pd.concat([existing_data, new_data], ignore_index=True)
-        updated_data.drop_duplicates(subset="date", keep="last", inplace=True)
-    else:
-        updated_data = avg_combined_data
     updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
-def reverse_pollution(NO2, O3, data):
-    df = data
-    start_index = 0
-    while NO2:
-        df.loc[start_index, "NO2"] = NO2.pop()
-        start_index += 1
-    start_index = 0
-    while O3:
-        df.loc[start_index, "O3"] = O3.pop()
-        start_index += 1
-    return df
 def get_combined_data():
     update_weather_data()
     update_pollution_data()
@@ -153,7 +129,7 @@ def get_combined_data():
     weather_df = weather_df[columns]
     columns.insert(9, columns.pop(6))
     weather_df = weather_df[columns]
     combined_df = weather_df
     # Apply scaling and renaming similar to the scale function from previous code
@@ -185,7 +161,7 @@ def get_combined_data():
     combined_df["pressure"] = combined_df["pressure"].astype(int)
     combined_df["humidity"] = combined_df["humidity"].astype(int)
     combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)
     pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
     combined_df["NO2"] = pollution_df["NO2"]
     combined_df["O3"] = pollution_df["O3"]

     all_dataframes = []
     today = date.today().isoformat() + "T09:00:00Z"
     yesterday = (date.today() - timedelta(1)).isoformat() + "T09:00:00Z"
+    if os.path.exists(POLLUTION_DATA_FILE):
+        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
+        last_date = pd.to_datetime(existing_data["date"]).max()
+        if last_date >= pd.Timestamp(date.today()):
+            print("Data is already up to date.")
+            return
+    # Only pull data for today if not already updated
+    for particle in particles:
+        for station in stations:
+            conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
+            payload = ""
+            headers = {}
+            conn.request(
+                "GET",
+                f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
+                payload,
+                headers,
+            )
+            res = conn.getresponse()
+            data = res.read()
+            decoded_data = data.decode("utf-8")
+            df = pd.read_csv(StringIO(decoded_data))
+            df = df.filter(like="value")
+            all_dataframes.append(df)
+        combined_data = pd.concat(all_dataframes, ignore_index=True)
+        values = []
+        for row in combined_data:
+            cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", row)
+            if cleaned_value:
+                values.append(float(cleaned_value[0]))
+        if values:
+            avg = sum(values) / len(values)
+            if particle == "NO2":
+                NO2.append(avg)
+            else:
+                O3.append(avg)
+    new_data = pd.DataFrame(
         {
+            "date": [date.today()],
             "NO2": NO2,
             "O3": O3,
         }
     )
+    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
+    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)
     updated_data.to_csv(POLLUTION_DATA_FILE, index=False)
 def get_combined_data():
     update_weather_data()
     update_pollution_data()
     weather_df = weather_df[columns]
     columns.insert(9, columns.pop(6))
     weather_df = weather_df[columns]
     combined_df = weather_df
     # Apply scaling and renaming similar to the scale function from previous code
     combined_df["pressure"] = combined_df["pressure"].astype(int)
     combined_df["humidity"] = combined_df["humidity"].astype(int)
     combined_df["global_radiation"] = combined_df["global_radiation"].astype(int)
     pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
     combined_df["NO2"] = pollution_df["NO2"]
     combined_df["O3"] = pollution_df["O3"]

src/past_data_api_calls copy.py DELETED Viewed

@@ -1,199 +0,0 @@
-import codecs
-import csv
-import http.client
-import os
-import re
-import sys
-import urllib.request
-from datetime import date, timedelta
-from io import StringIO
-import pandas as pd
-def pollution_data():
-    particles = ["NO2", "O3"]
-    stations = ["NL10636", "NL10639", "NL10643"]
-    last_year_date = date.today() - timedelta(days=365)
-    start_date = last_year_date - timedelta(days=7)
-    end_date = last_year_date + timedelta(days=3)
-    date_list = [
-        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
-    ]
-    for current_date in date_list:
-        today = current_date.isoformat() + "T09:00:00Z"
-        yesterday = (current_date - timedelta(1)).isoformat() + "T09:00:00Z"
-        for particle in particles:
-            all_dataframes = []  # Reset for each particle
-            for station in stations:
-                conn = http.client.HTTPSConnection("api.luchtmeetnet.nl")
-                payload = ""
-                headers = {}
-                conn.request(
-                    "GET",
-                    f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={today}&start={yesterday}",
-                    payload,
-                    headers,
-                )
-                res = conn.getresponse()
-                data = res.read()
-                decoded_data = data.decode("utf-8")
-                df = pd.read_csv(StringIO(decoded_data))
-                df = df.filter(like="value")
-                all_dataframes.append(df)
-            if all_dataframes:
-                combined_data = pd.concat(all_dataframes, ignore_index=True)
-                combined_data.to_csv(f"{particle}_{today}.csv", index=False)
-def delete_csv(csvs):
-    for csv_file in csvs:
-        if os.path.exists(csv_file) and os.path.isfile(csv_file):
-            os.remove(csv_file)
-def clean_values():
-    particles = ["NO2", "O3"]
-    csvs = []
-    NO2 = []
-    O3 = []
-    last_year_date = date.today() - timedelta(days=365)
-    start_date = last_year_date - timedelta(days=7)
-    end_date = last_year_date + timedelta(days=3)
-    date_list = [
-        start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
-    ]
-    for current_date in date_list:
-        today = current_date.isoformat() + "T09:00:00Z"
-        for particle in particles:
-            name = f"{particle}_{today}.csv"
-            csvs.append(name)
-    for csv_file in csvs:
-        if not os.path.exists(csv_file):
-            continue  # Skip if the file doesn't exist
-        values = []  # Reset values for each CSV file
-        # Open the CSV file and read the values
-        with open(csv_file, "r") as file:
-            reader = csv.reader(file)
-            for row in reader:
-                for value in row:
-                    # Use regular expressions to extract numeric part
-                    cleaned_value = re.findall(r"[-+]?\d*\.\d+|\d+", value)
-                    if cleaned_value:  # If we successfully extract a number
-                        values.append(
-                            float(cleaned_value[0])
-                        )  # Convert the first match to float
-        # Compute the average if the values list is not empty
-        if values:
-            avg = sum(values) / len(values)
-            if "NO2" in csv_file:
-                NO2.append(avg)
-            else:
-                O3.append(avg)
-    delete_csv(csvs)
-    return NO2, O3
-def add_columns():
-    file_path = "weather_data.csv"
-    df = pd.read_csv(file_path)
-    df.insert(1, "NO2", None)
-    df.insert(2, "O3", None)
-    df.insert(10, "weekday", None)
-    return df
-def scale(data):
-    df = data
-    columns = list(df.columns)
-    columns.insert(3, columns.pop(6))
-    df = df[columns]
-    columns.insert(5, columns.pop(9))
-    df = df[columns]
-    columns.insert(9, columns.pop(6))
-    df = df[columns]
-    df = df.rename(
-        columns={
-            "datetime": "date",
-            "windspeed": "wind_speed",
-            "temp": "mean_temp",
-            "solarradiation": "global_radiation",
-            "precip": "percipitation",
-            "sealevelpressure": "pressure",
-            "visibility": "minimum_visibility",
-        }
-    )
-    df["date"] = pd.to_datetime(df["date"])
-    df["weekday"] = df["date"].dt.day_name()
-    df = df.sort_values(by="date").reset_index(drop=True)
-    df["wind_speed"] = (df["wind_speed"] / 3.6) * 10
-    df["mean_temp"] = df["mean_temp"] * 10
-    df["minimum_visibility"] = df["minimum_visibility"] * 10
-    df["percipitation"] = df["percipitation"] * 10
-    df["pressure"] = df["pressure"]
-    df["wind_speed"] = df["wind_speed"].astype(int)
-    df["mean_temp"] = df["mean_temp"].astype(int)
-    df["minimum_visibility"] = df["minimum_visibility"].astype(int)
-    df["percipitation"] = df["percipitation"].astype(int)
-    df["pressure"] = df["pressure"].astype(int)
-    df["humidity"] = df["humidity"].astype(int)
-    df["global_radiation"] = df["global_radiation"].astype(int)
-    return df
-def insert_pollution(NO2, O3, data):
-    df = data
-    df["NO2"] = NO2
-    df["O3"] = O3
-    return df
-def weather_data():
-    last_year_date = date.today() - timedelta(days=365)
-    start_date = (last_year_date - timedelta(days=7)).isoformat()
-    end_date = (last_year_date + timedelta(days=3)).isoformat()
-    try:
-        ResultBytes = urllib.request.urlopen(
-            f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"
-        )
-        # Parse the results as CSV
-        CSVText = csv.reader(codecs.iterdecode(ResultBytes, "utf-8"))
-        # Saving the CSV content to a file
-        current_dir = os.path.dirname(os.path.realpath(__file__))
-        file_path = os.path.join(current_dir, "past_weather_data.csv")
-        with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
-            csv_writer = csv.writer(csvfile)
-            csv_writer.writerows(CSVText)
-    except urllib.error.HTTPError as e:
-        ErrorInfo = e.read().decode()
-        print("Error code: ", e.code, ErrorInfo)
-        sys.exit()
-    except urllib.error.URLError as e:
-        ErrorInfo = e.read().decode()
-        print("Error code: ", e.code, ErrorInfo)
-        sys.exit()
-def get_past_data():
-    weather_data()
-    pollution_data()
-    NO2, O3 = clean_values()
-    df = add_columns()
-    scaled_df = scale(df)
-    output_df = insert_pollution(NO2, O3, scaled_df)
-    os.remove("past_weather_data.csv")
-    return output_df

src/past_data_api_calls.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import codecs
 import csv
 import http.client
 import re
 import sys
 import urllib.request
@@ -9,14 +10,21 @@ from io import StringIO
 import pandas as pd
-PAST_WEATHER_DATA_FILE = "weather_data.csv"
-PAST_POLLUTION_DATA_FILE = "pollution_data.csv"
-def get_past_weather_data():
     last_year_date = date.today() - timedelta(days=365)
-    start_date = (last_year_date - timedelta(days=8)).isoformat()
-    end_date = (last_year_date + timedelta(days=2)).isoformat()
     try:
         ResultBytes = urllib.request.urlopen(
@@ -28,7 +36,10 @@ def get_past_weather_data():
         data.columns = data.iloc[0]
         data = data[1:]
         data = data.rename(columns={"datetime": "date"})
-        return data
     except urllib.error.HTTPError as e:
         ErrorInfo = e.read().decode()
@@ -40,15 +51,29 @@ def get_past_weather_data():
         sys.exit()
-def get_past_pollution_data():
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
     stations = ["NL10636", "NL10639", "NL10643"]
     all_dataframes = []
     last_year_date = date.today() - timedelta(days=365)
-    start_date = last_year_date - timedelta(days=7)
-    end_date = last_year_date + timedelta(days=3)
     date_list = [
         start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
     ]
@@ -88,16 +113,31 @@ def get_past_pollution_data():
                 else:
                     O3.append(avg)
     return NO2, O3
 def get_past_combined_data():
-    weather_df = get_past_weather_data()
-    NO2_df, O3_df = get_past_pollution_data()
-    combined_df = weather_df
-    combined_df["NO2"] = NO2_df
-    combined_df["O3"] = O3_df
     # Apply scaling and renaming similar to the scale function from previous code
     combined_df = combined_df.rename(
@@ -114,7 +154,7 @@ def get_past_combined_data():
     combined_df["date"] = pd.to_datetime(combined_df["date"])
     combined_df["weekday"] = combined_df["date"].dt.day_name()
     combined_df["wind_speed"] = combined_df["wind_speed"].astype(float)
     combined_df["mean_temp"] = combined_df["mean_temp"].astype(float)
     combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float)
@@ -128,13 +168,23 @@ def get_past_combined_data():
     combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
     combined_df["percipitation"] = combined_df["percipitation"] * 10
     combined_df["pressure"] = combined_df["pressure"] * 10
-    combined_df["wind_speed"] = combined_df["wind_speed"].astype(float).round().astype(int)
-    combined_df["mean_temp"] = combined_df["mean_temp"].astype(float).round().astype(int)
-    combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float).round().astype(int)
-    combined_df["percipitation"] = combined_df["percipitation"].astype(float).round().astype(int)
     combined_df["pressure"] = combined_df["pressure"].astype(float).round().astype(int)
     combined_df["humidity"] = combined_df["humidity"].astype(float).round().astype(int)
-    combined_df["global_radiation"] = combined_df["global_radiation"].astype(float).round().astype(int)
     return combined_df

 import codecs
 import csv
 import http.client
+import os
 import re
 import sys
 import urllib.request
 import pandas as pd
+PAST_WEATHER_DATA_FILE = "past_weather_data.csv"
+PAST_POLLUTION_DATA_FILE = "past_pollution_data.csv"
+def update_past_weather_data():
     last_year_date = date.today() - timedelta(days=365)
+    if os.path.exists(PAST_WEATHER_DATA_FILE):
+        df = pd.read_csv(PAST_WEATHER_DATA_FILE)
+        start_date = pd.to_datetime(df["date"]).max().date().isoformat()
+        end_date = (last_year_date + timedelta(days=2)).isoformat()
+    else:
+        df = pd.DataFrame()
+        start_date = (last_year_date - timedelta(days=8)).isoformat()
+        end_date = (last_year_date + timedelta(days=2)).isoformat()
     try:
         ResultBytes = urllib.request.urlopen(
         data.columns = data.iloc[0]
         data = data[1:]
         data = data.rename(columns={"datetime": "date"})
+        updated_df = pd.concat([df, data], ignore_index=True)
+        updated_df.drop_duplicates(subset="date", keep="last", inplace=True)
+        updated_df.to_csv(PAST_WEATHER_DATA_FILE, index=False)
     except urllib.error.HTTPError as e:
         ErrorInfo = e.read().decode()
         sys.exit()
+def update_past_pollution_data():
     O3 = []
     NO2 = []
     particles = ["NO2", "O3"]
     stations = ["NL10636", "NL10639", "NL10643"]
     all_dataframes = []
     last_year_date = date.today() - timedelta(days=365)
+    if os.path.exists(PAST_POLLUTION_DATA_FILE):
+        existing_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)
+        last_date = pd.to_datetime(existing_data["date"]).max()
+        if last_date >= pd.to_datetime(last_year_date):
+            print("Data is already up to date.")
+            return
+        else:
+            start_date = last_date.date()
+            end_date = last_year_date + timedelta(days=3)
+    else:
+        existing_data = pd.DataFrame()
+        start_date = last_year_date - timedelta(days=7)
+        end_date = last_year_date + timedelta(days=3)
     date_list = [
         start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)
     ]
                 else:
                     O3.append(avg)
+    new_data = pd.DataFrame(
+        {
+            "date": date_list,
+            "NO2": NO2,
+            "O3": O3,
+        }
+    )
+    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
+    updated_data.drop_duplicates(subset="date", keep="last", inplace=True)
+    updated_data.to_csv(PAST_POLLUTION_DATA_FILE, index=False)
     return NO2, O3
 def get_past_combined_data():
+    update_past_weather_data()
+    update_past_pollution_data()
+    combined_df = pd.read_csv(PAST_WEATHER_DATA_FILE)
+    pollution_data = pd.read_csv(PAST_POLLUTION_DATA_FILE)
+    combined_df["NO2"] = pollution_data["NO2"]
+    combined_df["O3"] = pollution_data["O3"]
     # Apply scaling and renaming similar to the scale function from previous code
     combined_df = combined_df.rename(
     combined_df["date"] = pd.to_datetime(combined_df["date"])
     combined_df["weekday"] = combined_df["date"].dt.day_name()
     combined_df["wind_speed"] = combined_df["wind_speed"].astype(float)
     combined_df["mean_temp"] = combined_df["mean_temp"].astype(float)
     combined_df["minimum_visibility"] = combined_df["minimum_visibility"].astype(float)
     combined_df["minimum_visibility"] = combined_df["minimum_visibility"] * 10
     combined_df["percipitation"] = combined_df["percipitation"] * 10
     combined_df["pressure"] = combined_df["pressure"] * 10
+    combined_df["wind_speed"] = (
+        combined_df["wind_speed"].astype(float).round().astype(int)
+    )
+    combined_df["mean_temp"] = (
+        combined_df["mean_temp"].astype(float).round().astype(int)
+    )
+    combined_df["minimum_visibility"] = (
+        combined_df["minimum_visibility"].astype(float).round().astype(int)
+    )
+    combined_df["percipitation"] = (
+        combined_df["percipitation"].astype(float).round().astype(int)
+    )
     combined_df["pressure"] = combined_df["pressure"].astype(float).round().astype(int)
     combined_df["humidity"] = combined_df["humidity"].astype(float).round().astype(int)
+    combined_df["global_radiation"] = (
+        combined_df["global_radiation"].astype(float).round().astype(int)
+    )
     return combined_df

src/predict.py CHANGED Viewed

@@ -17,7 +17,7 @@ def load_model(particle):
     if particle == "O3":
         file_name = "O3_svr_model.pkl"
     elif particle == "NO2":
-        file_name == "NO2_nn_model.pkl"
     model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
     model = joblib.load(model_path)
@@ -48,7 +48,7 @@ def get_data_and_predictions():
                 "pollutant": "O3",
                 "date_predicted": date.today(),
                 "date": date.today() + timedelta(days=i + 1),
-                "prediction_value": o3_predictions[i],
             }
         )
         prediction_data.append(
@@ -56,15 +56,20 @@ def get_data_and_predictions():
                 "pollutant": "NO2",
                 "date_predicted": date.today(),
                 "date": date.today() + timedelta(days=i + 1),
-                "prediction_value": no2_predictions[i],
             }
         )
     predictions_df = pd.DataFrame(prediction_data)
     if os.path.exists(PREDICTIONS_FILE):
-        predictions_df.to_csv(PREDICTIONS_FILE, mode="a", header=False, index=False)
     else:
-        predictions_df.to_csv(PREDICTIONS_FILE, mode="w", header=True, index=False)
     return week_data, o3_predictions, no2_predictions

     if particle == "O3":
         file_name = "O3_svr_model.pkl"
     elif particle == "NO2":
+        file_name = "NO2_svr_model.pkl"
     model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
     model = joblib.load(model_path)
                 "pollutant": "O3",
                 "date_predicted": date.today(),
                 "date": date.today() + timedelta(days=i + 1),
+                "prediction_value": o3_predictions[0][i],
             }
         )
         prediction_data.append(
                 "pollutant": "NO2",
                 "date_predicted": date.today(),
                 "date": date.today() + timedelta(days=i + 1),
+                "prediction_value": no2_predictions[0][i],
             }
         )
     predictions_df = pd.DataFrame(prediction_data)
     if os.path.exists(PREDICTIONS_FILE):
+        existing_data = pd.read_csv(PREDICTIONS_FILE)
+        combined_data = pd.concat([existing_data, predictions_df])
+        combined_data = combined_data.drop_duplicates(
+            subset=["pollutant", "date_predicted", "date"], keep="first"
+        )
     else:
+        combined_data = predictions_df
+    combined_data.to_csv(PREDICTIONS_FILE, index=False)
     return week_data, o3_predictions, no2_predictions

test.ipynb CHANGED Viewed

@@ -15,7 +15,9 @@
     }
    ],
    "source": [
-    "from src.predict import get_data_and_predictions"
    ]
   },
   {
@@ -24,22 +26,14 @@
    "metadata": {},
    "outputs": [
     {
-     "ename": "ValueError",
-     "evalue": "Length of values (0) does not match length of index (11)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m week_data, predictions_O3, predictions_NO2 \u001b[38;5;241m=\u001b[39m \u001b[43mget_data_and_predictions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\predict.py:41\u001b[0m, in \u001b[0;36mget_data_and_predictions\u001b[1;34m()\u001b[0m\n\u001b[0;32m     37\u001b[0m PREDICTIONS_FILE \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpredictions_history.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m     39\u001b[0m week_data \u001b[38;5;241m=\u001b[39m get_combined_data()\n\u001b[1;32m---> 41\u001b[0m o3_predictions \u001b[38;5;241m=\u001b[39m \u001b[43mrun_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mO3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweek_data\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     42\u001b[0m no2_predictions \u001b[38;5;241m=\u001b[39m run_model(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNO2\u001b[39m\u001b[38;5;124m\"\u001b[39m, data\u001b[38;5;241m=\u001b[39mweek_data)\n\u001b[0;32m     44\u001b[0m prediction_data \u001b[38;5;241m=\u001b[39m []\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\predict.py:28\u001b[0m, in \u001b[0;36mrun_model\u001b[1;34m(particle, data)\u001b[0m\n\u001b[0;32m     27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrun_model\u001b[39m(particle, data):\n\u001b[1;32m---> 28\u001b[0m     input_data \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_particle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparticle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     29\u001b[0m     model \u001b[38;5;241m=\u001b[39m load_model(particle)\n\u001b[0;32m     30\u001b[0m     prediction \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mpredict(input_data)\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\features_pipeline.py:60\u001b[0m, in \u001b[0;36mcreate_features\u001b[1;34m(data, target_particle, lag_days, sma_days)\u001b[0m\n\u001b[0;32m     55\u001b[0m     data[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfeature\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_sma_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msma_days\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m     56\u001b[0m         data[feature]\u001b[38;5;241m.\u001b[39mrolling(window\u001b[38;5;241m=\u001b[39msma_days)\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m     57\u001b[0m     )\n\u001b[0;32m     59\u001b[0m \u001b[38;5;66;03m# Create particle data (NO2 and O3) from the same time last year\u001b[39;00m\n\u001b[1;32m---> 60\u001b[0m past_data \u001b[38;5;241m=\u001b[39m \u001b[43mget_past_combined_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     62\u001b[0m \u001b[38;5;66;03m# Today last year\u001b[39;00m\n\u001b[0;32m     63\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3_last_year\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m past_data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m4\u001b[39m]\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\src\\past_data_api_calls.py:99\u001b[0m, in \u001b[0;36mget_past_combined_data\u001b[1;34m()\u001b[0m\n\u001b[0;32m     96\u001b[0m NO2_df, O3_df \u001b[38;5;241m=\u001b[39m get_past_pollution_data()\n\u001b[0;32m     98\u001b[0m combined_df \u001b[38;5;241m=\u001b[39m weather_df\n\u001b[1;32m---> 99\u001b[0m \u001b[43mcombined_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mNO2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m NO2_df\n\u001b[0;32m    100\u001b[0m combined_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO3\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m O3_df\n\u001b[0;32m    102\u001b[0m \u001b[38;5;66;03m# Apply scaling and renaming similar to the scale function from previous code\u001b[39;00m\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:4311\u001b[0m, in \u001b[0;36mDataFrame.__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m   4308\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_setitem_array([key], value)\n\u001b[0;32m   4309\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   4310\u001b[0m     \u001b[38;5;66;03m# set column\u001b[39;00m\n\u001b[1;32m-> 4311\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_set_item\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:4524\u001b[0m, in \u001b[0;36mDataFrame._set_item\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m   4514\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_set_item\u001b[39m(\u001b[38;5;28mself\u001b[39m, key, value) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   4515\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   4516\u001b[0m \u001b[38;5;124;03m    Add series to DataFrame in specified column.\u001b[39;00m\n\u001b[0;32m   4517\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   4522\u001b[0m \u001b[38;5;124;03m    ensure homogeneity.\u001b[39;00m\n\u001b[0;32m   4523\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4524\u001b[0m     value, refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sanitize_column\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   4526\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   4527\u001b[0m         key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[0;32m   4528\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   4529\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value\u001b[38;5;241m.\u001b[39mdtype, ExtensionDtype)\n\u001b[0;32m   4530\u001b[0m     ):\n\u001b[0;32m   4531\u001b[0m         \u001b[38;5;66;03m# broadcast across multiple columns if necessary\u001b[39;00m\n\u001b[0;32m   4532\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mis_unique \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns, MultiIndex):\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:5266\u001b[0m, in \u001b[0;36mDataFrame._sanitize_column\u001b[1;34m(self, value)\u001b[0m\n\u001b[0;32m   5263\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _reindex_for_setitem(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex)\n\u001b[0;32m   5265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_list_like(value):\n\u001b[1;32m-> 5266\u001b[0m     \u001b[43mcom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequire_length_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   5267\u001b[0m arr \u001b[38;5;241m=\u001b[39m sanitize_array(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, allow_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m   5268\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   5269\u001b[0m     \u001b[38;5;28misinstance\u001b[39m(value, Index)\n\u001b[0;32m   5270\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobject\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   5273\u001b[0m     \u001b[38;5;66;03m# TODO: Remove kludge in sanitize_array for string mode when enforcing\u001b[39;00m\n\u001b[0;32m   5274\u001b[0m     \u001b[38;5;66;03m# this deprecation\u001b[39;00m\n",
-      "File \u001b[1;32mc:\\Users\\elikl\\Documents\\Uni\\yr3\\ML for industry\\utrecht-pollution-prediction\\.venv\\Lib\\site-packages\\pandas\\core\\common.py:573\u001b[0m, in \u001b[0;36mrequire_length_match\u001b[1;34m(data, index)\u001b[0m\n\u001b[0;32m    569\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    570\u001b[0m \u001b[38;5;124;03mCheck the length of data matches the length of the index.\u001b[39;00m\n\u001b[0;32m    571\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    572\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(index):\n\u001b[1;32m--> 573\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m    574\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLength of values \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    575\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    576\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match length of index \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    577\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(index)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    578\u001b[0m     )\n",
-      "\u001b[1;31mValueError\u001b[0m: Length of values (0) does not match length of index (11)"
      ]
     }
    ],
@@ -47,29 +41,10 @@
     "week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "week_data"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = pd.read_csv(\"dataset.csv\")\n",
-    "target_particle = \"O3\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -108,23 +83,9 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>2024-10-16</td>\n",
-       "      <td>22.602712</td>\n",
-       "      <td>22.881288</td>\n",
-       "      <td>61</td>\n",
-       "      <td>151</td>\n",
-       "      <td>40</td>\n",
-       "      <td>0</td>\n",
-       "      <td>10103</td>\n",
-       "      <td>358</td>\n",
-       "      <td>82</td>\n",
-       "      <td>Wednesday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
        "      <td>2024-10-17</td>\n",
-       "      <td>23.104327</td>\n",
-       "      <td>23.038638</td>\n",
        "      <td>51</td>\n",
        "      <td>169</td>\n",
        "      <td>43</td>\n",
@@ -135,52 +96,52 @@
        "      <td>Thursday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
        "      <td>2024-10-18</td>\n",
-       "      <td>23.682857</td>\n",
-       "      <td>23.716611</td>\n",
        "      <td>21</td>\n",
-       "      <td>156</td>\n",
        "      <td>42</td>\n",
        "      <td>39</td>\n",
        "      <td>10140</td>\n",
-       "      <td>64</td>\n",
        "      <td>97</td>\n",
        "      <td>Friday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
        "      <td>2024-10-19</td>\n",
-       "      <td>24.532039</td>\n",
-       "      <td>23.604723</td>\n",
-       "      <td>43</td>\n",
        "      <td>147</td>\n",
        "      <td>43</td>\n",
-       "      <td>28</td>\n",
-       "      <td>10140</td>\n",
-       "      <td>236</td>\n",
-       "      <td>92</td>\n",
        "      <td>Saturday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4</th>\n",
        "      <td>2024-10-20</td>\n",
-       "      <td>23.019102</td>\n",
-       "      <td>24.173377</td>\n",
-       "      <td>68</td>\n",
-       "      <td>145</td>\n",
-       "      <td>0</td>\n",
        "      <td>0</td>\n",
        "      <td>10160</td>\n",
-       "      <td>241</td>\n",
-       "      <td>82</td>\n",
        "      <td>Sunday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5</th>\n",
        "      <td>2024-10-21</td>\n",
-       "      <td>21.275629</td>\n",
-       "      <td>25.058736</td>\n",
        "      <td>58</td>\n",
        "      <td>144</td>\n",
        "      <td>27</td>\n",
@@ -191,499 +152,120 @@
        "      <td>Monday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6</th>\n",
        "      <td>2024-10-22</td>\n",
-       "      <td>22.334375</td>\n",
-       "      <td>24.594219</td>\n",
-       "      <td>76</td>\n",
-       "      <td>123</td>\n",
        "      <td>57</td>\n",
-       "      <td>12</td>\n",
-       "      <td>10265</td>\n",
-       "      <td>100</td>\n",
-       "      <td>87</td>\n",
        "      <td>Tuesday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7</th>\n",
        "      <td>2024-10-23</td>\n",
-       "      <td>24.261733</td>\n",
-       "      <td>23.560000</td>\n",
-       "      <td>31</td>\n",
-       "      <td>115</td>\n",
-       "      <td>7</td>\n",
        "      <td>0</td>\n",
        "      <td>10328</td>\n",
-       "      <td>105</td>\n",
-       "      <td>95</td>\n",
        "      <td>Wednesday</td>\n",
        "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         date        NO2         O3  wind_speed  mean_temp  global_radiation  \\\n",
-       "0  2024-10-16  22.602712  22.881288          61        151                40   \n",
-       "1  2024-10-17  23.104327  23.038638          51        169                43   \n",
-       "2  2024-10-18  23.682857  23.716611          21        156                42   \n",
-       "3  2024-10-19  24.532039  23.604723          43        147                43   \n",
-       "4  2024-10-20  23.019102  24.173377          68        145                 0   \n",
-       "5  2024-10-21  21.275629  25.058736          58        144                27   \n",
-       "6  2024-10-22  22.334375  24.594219          76        123                57   \n",
-       "7  2024-10-23  24.261733  23.560000          31        115                 7   \n",
-       "\n",
-       "   percipitation  pressure  minimum_visibility  humidity    weekday  \n",
-       "0              0     10103                 358        82  Wednesday  \n",
-       "1              6     10100                 371        86   Thursday  \n",
-       "2             39     10140                  64        97     Friday  \n",
-       "3             28     10140                 236        92   Saturday  \n",
-       "4              0     10160                 241        82     Sunday  \n",
-       "5             43     10206                 220        92     Monday  \n",
-       "6             12     10265                 100        87    Tuesday  \n",
-       "7              0     10328                 105        95  Wednesday  "
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of rows with missing values dropped: 7\n"
-     ]
-    }
-   ],
-   "source": [
-    "input_data = create_features(\n",
-    "    data=data,\n",
-    "    target_particle=target_particle,\n",
-    "    lag_days=7,\n",
-    "    sma_days=7,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>NO2</th>\n",
-       "      <th>O3</th>\n",
-       "      <th>wind_speed</th>\n",
-       "      <th>mean_temp</th>\n",
-       "      <th>global_radiation</th>\n",
-       "      <th>percipitation</th>\n",
-       "      <th>pressure</th>\n",
-       "      <th>minimum_visibility</th>\n",
-       "      <th>humidity</th>\n",
-       "      <th>weekday_sin</th>\n",
-       "      <th>...</th>\n",
-       "      <th>O3_last_year_4_days_before</th>\n",
-       "      <th>NO2_last_year_4_days_before</th>\n",
-       "      <th>O3_last_year_5_days_before</th>\n",
-       "      <th>NO2_last_year_5_days_before</th>\n",
-       "      <th>O3_last_year_6_days_before</th>\n",
-       "      <th>NO2_last_year_6_days_before</th>\n",
-       "      <th>O3_last_year_7_days_before</th>\n",
-       "      <th>NO2_last_year_7_days_before</th>\n",
-       "      <th>O3_last_year_3_days_after</th>\n",
-       "      <th>NO2_last_year_3_days_after</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
        "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>-0.126371</td>\n",
-       "      <td>-0.855455</td>\n",
-       "      <td>-0.206181</td>\n",
-       "      <td>0.082314</td>\n",
-       "      <td>-1.330268</td>\n",
-       "      <td>-0.493936</td>\n",
-       "      <td>1.783274</td>\n",
-       "      <td>2.813837</td>\n",
-       "      <td>1.547919</td>\n",
-       "      <td>1.37753</td>\n",
-       "      <td>...</td>\n",
-       "      <td>-1.036205</td>\n",
-       "      <td>-0.802392</td>\n",
-       "      <td>-0.883032</td>\n",
-       "      <td>-0.968984</td>\n",
-       "      <td>0.333776</td>\n",
-       "      <td>-1.446199</td>\n",
-       "      <td>-1.180992</td>\n",
-       "      <td>-0.54567</td>\n",
-       "      <td>-1.15814</td>\n",
-       "      <td>-0.358079</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>1 rows × 87 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "        NO2        O3  wind_speed  mean_temp  global_radiation  percipitation  \\\n",
-       "0 -0.126371 -0.855455   -0.206181   0.082314         -1.330268      -0.493936   \n",
-       "\n",
-       "   pressure  minimum_visibility  humidity  weekday_sin  ...  \\\n",
-       "0  1.783274            2.813837  1.547919      1.37753  ...   \n",
-       "\n",
-       "   O3_last_year_4_days_before  NO2_last_year_4_days_before  \\\n",
-       "0                   -1.036205                    -0.802392   \n",
-       "\n",
-       "   O3_last_year_5_days_before  NO2_last_year_5_days_before  \\\n",
-       "0                   -0.883032                    -0.968984   \n",
        "\n",
-       "   O3_last_year_6_days_before  NO2_last_year_6_days_before  \\\n",
-       "0                    0.333776                    -1.446199   \n",
-       "\n",
-       "   O3_last_year_7_days_before  NO2_last_year_7_days_before  \\\n",
-       "0                   -1.180992                     -0.54567   \n",
-       "\n",
-       "   O3_last_year_3_days_after  NO2_last_year_3_days_after  \n",
-       "0                   -1.15814                   -0.358079  \n",
-       "\n",
-       "[1 rows x 87 columns]"
       ]
      },
-     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "input_data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#prediction = run_model(particle=\"O3\", data=df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>date</th>\n",
-       "      <th>NO2</th>\n",
-       "      <th>O3</th>\n",
-       "      <th>wind_speed</th>\n",
-       "      <th>mean_temp</th>\n",
-       "      <th>global_radiation</th>\n",
-       "      <th>percipitation</th>\n",
-       "      <th>pressure</th>\n",
-       "      <th>minimum_visibility</th>\n",
-       "      <th>humidity</th>\n",
-       "      <th>weekday</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>2023-10-16</td>\n",
-       "      <td>17.958784</td>\n",
-       "      <td>32.611400</td>\n",
-       "      <td>31</td>\n",
-       "      <td>90</td>\n",
-       "      <td>68</td>\n",
-       "      <td>9</td>\n",
-       "      <td>1022</td>\n",
-       "      <td>348</td>\n",
-       "      <td>88</td>\n",
-       "      <td>Monday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2023-10-17</td>\n",
-       "      <td>10.842703</td>\n",
-       "      <td>39.812600</td>\n",
-       "      <td>61</td>\n",
-       "      <td>85</td>\n",
-       "      <td>75</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1019</td>\n",
-       "      <td>348</td>\n",
-       "      <td>84</td>\n",
-       "      <td>Tuesday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2023-10-18</td>\n",
-       "      <td>17.970267</td>\n",
-       "      <td>31.779024</td>\n",
-       "      <td>71</td>\n",
-       "      <td>90</td>\n",
-       "      <td>71</td>\n",
-       "      <td>23</td>\n",
-       "      <td>1006</td>\n",
-       "      <td>238</td>\n",
-       "      <td>77</td>\n",
-       "      <td>Wednesday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>2023-10-19</td>\n",
-       "      <td>17.233056</td>\n",
-       "      <td>18.715600</td>\n",
-       "      <td>61</td>\n",
-       "      <td>145</td>\n",
-       "      <td>39</td>\n",
-       "      <td>114</td>\n",
-       "      <td>990</td>\n",
-       "      <td>212</td>\n",
-       "      <td>94</td>\n",
-       "      <td>Thursday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>2023-10-20</td>\n",
-       "      <td>15.023600</td>\n",
-       "      <td>22.040000</td>\n",
-       "      <td>71</td>\n",
-       "      <td>119</td>\n",
-       "      <td>7</td>\n",
-       "      <td>204</td>\n",
-       "      <td>981</td>\n",
-       "      <td>104</td>\n",
-       "      <td>97</td>\n",
-       "      <td>Friday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>2023-10-21</td>\n",
-       "      <td>8.723378</td>\n",
-       "      <td>48.334400</td>\n",
-       "      <td>61</td>\n",
-       "      <td>131</td>\n",
-       "      <td>39</td>\n",
-       "      <td>35</td>\n",
-       "      <td>989</td>\n",
-       "      <td>277</td>\n",
-       "      <td>88</td>\n",
-       "      <td>Saturday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>2023-10-22</td>\n",
-       "      <td>20.634267</td>\n",
-       "      <td>15.586000</td>\n",
-       "      <td>71</td>\n",
-       "      <td>121</td>\n",
-       "      <td>55</td>\n",
-       "      <td>39</td>\n",
-       "      <td>1003</td>\n",
-       "      <td>323</td>\n",
-       "      <td>87</td>\n",
-       "      <td>Sunday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>2023-10-23</td>\n",
-       "      <td>15.115600</td>\n",
-       "      <td>24.628085</td>\n",
-       "      <td>50</td>\n",
-       "      <td>99</td>\n",
-       "      <td>43</td>\n",
-       "      <td>5</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>59</td>\n",
-       "      <td>95</td>\n",
-       "      <td>Monday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>2023-10-24</td>\n",
-       "      <td>22.885676</td>\n",
-       "      <td>27.117600</td>\n",
-       "      <td>61</td>\n",
-       "      <td>116</td>\n",
-       "      <td>32</td>\n",
-       "      <td>65</td>\n",
-       "      <td>1001</td>\n",
-       "      <td>231</td>\n",
-       "      <td>92</td>\n",
-       "      <td>Tuesday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>2023-10-25</td>\n",
-       "      <td>21.531757</td>\n",
-       "      <td>13.321600</td>\n",
-       "      <td>50</td>\n",
-       "      <td>93</td>\n",
-       "      <td>14</td>\n",
-       "      <td>153</td>\n",
-       "      <td>996</td>\n",
-       "      <td>157</td>\n",
-       "      <td>96</td>\n",
-       "      <td>Wednesday</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>2023-10-26</td>\n",
-       "      <td>23.072267</td>\n",
-       "      <td>16.154167</td>\n",
-       "      <td>31</td>\n",
-       "      <td>94</td>\n",
-       "      <td>36</td>\n",
-       "      <td>1</td>\n",
-       "      <td>995</td>\n",
-       "      <td>48</td>\n",
-       "      <td>97</td>\n",
-       "      <td>Thursday</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "         date        NO2         O3  wind_speed  mean_temp  global_radiation  \\\n",
-       "0  2023-10-16  17.958784  32.611400          31         90                68   \n",
-       "1  2023-10-17  10.842703  39.812600          61         85                75   \n",
-       "2  2023-10-18  17.970267  31.779024          71         90                71   \n",
-       "3  2023-10-19  17.233056  18.715600          61        145                39   \n",
-       "4  2023-10-20  15.023600  22.040000          71        119                 7   \n",
-       "5  2023-10-21   8.723378  48.334400          61        131                39   \n",
-       "6  2023-10-22  20.634267  15.586000          71        121                55   \n",
-       "7  2023-10-23  15.115600  24.628085          50         99                43   \n",
-       "8  2023-10-24  22.885676  27.117600          61        116                32   \n",
-       "9  2023-10-25  21.531757  13.321600          50         93                14   \n",
-       "10 2023-10-26  23.072267  16.154167          31         94                36   \n",
-       "\n",
-       "    percipitation  pressure  minimum_visibility  humidity    weekday  \n",
-       "0               9      1022                 348        88     Monday  \n",
-       "1               0      1019                 348        84    Tuesday  \n",
-       "2              23      1006                 238        77  Wednesday  \n",
-       "3             114       990                 212        94   Thursday  \n",
-       "4             204       981                 104        97     Friday  \n",
-       "5              35       989                 277        88   Saturday  \n",
-       "6              39      1003                 323        87     Sunday  \n",
-       "7               5      1011                  59        95     Monday  \n",
-       "8              65      1001                 231        92    Tuesday  \n",
-       "9             153       996                 157        96  Wednesday  \n",
-       "10              1       995                  48        97   Thursday  "
       ]
      },
-     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "get_past_data()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-10-23 19:40:20.321 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
-      "2024-10-23 19:40:20.322 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
-      "2024-10-23 19:40:20.323 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Number of rows with missing values dropped: 7\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-10-23 19:40:34.183 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n",
-      "2024-10-23 19:40:34.184 Thread 'MainThread': missing ScriptRunContext! This warning can be ignored when running in bare mode.\n"
-     ]
-    }
-   ],
-   "source": [
-    "prediction=run_model(particle=target_particle, data=data)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array([[19.90814701,  8.8039613 , 26.57711386]])"
       ]
      },
-     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "prediction"
    ]
   }
  ],

     }
    ],
    "source": [
+    "from src.predict import get_data_and_predictions\n",
+    "from src.data_api_calls import get_combined_data\n",
+    "from src.past_data_api_calls import get_past_combined_data"
    ]
   },
   {
    "metadata": {},
    "outputs": [
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data is already up to date.\n",
+      "Data is already up to date.\n",
+      "Number of rows with missing values dropped: 7\n",
+      "Data is already up to date.\n",
+      "Number of rows with missing values dropped: 7\n"
      ]
     }
    ],
     "week_data, predictions_O3, predictions_NO2 = get_data_and_predictions()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "data": {
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>2024-10-17</td>\n",
+       "      <td>22.804605</td>\n",
+       "      <td>22.769160</td>\n",
        "      <td>51</td>\n",
        "      <td>169</td>\n",
        "      <td>43</td>\n",
        "      <td>Thursday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1</th>\n",
        "      <td>2024-10-18</td>\n",
+       "      <td>23.268500</td>\n",
+       "      <td>23.307332</td>\n",
        "      <td>21</td>\n",
+       "      <td>155</td>\n",
        "      <td>42</td>\n",
        "      <td>39</td>\n",
        "      <td>10140</td>\n",
+       "      <td>45</td>\n",
        "      <td>97</td>\n",
        "      <td>Friday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>2</th>\n",
        "      <td>2024-10-19</td>\n",
+       "      <td>23.910064</td>\n",
+       "      <td>23.171714</td>\n",
+       "      <td>41</td>\n",
        "      <td>147</td>\n",
        "      <td>43</td>\n",
+       "      <td>16</td>\n",
+       "      <td>10141</td>\n",
+       "      <td>228</td>\n",
+       "      <td>89</td>\n",
        "      <td>Saturday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>3</th>\n",
        "      <td>2024-10-20</td>\n",
+       "      <td>22.573238</td>\n",
+       "      <td>23.537845</td>\n",
+       "      <td>81</td>\n",
+       "      <td>155</td>\n",
        "      <td>0</td>\n",
+       "      <td>5</td>\n",
        "      <td>10160</td>\n",
+       "      <td>415</td>\n",
+       "      <td>83</td>\n",
        "      <td>Sunday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>4</th>\n",
        "      <td>2024-10-21</td>\n",
+       "      <td>21.145700</td>\n",
+       "      <td>24.020696</td>\n",
        "      <td>58</td>\n",
        "      <td>144</td>\n",
        "      <td>27</td>\n",
        "      <td>Monday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>5</th>\n",
        "      <td>2024-10-22</td>\n",
+       "      <td>21.776580</td>\n",
+       "      <td>23.335886</td>\n",
+       "      <td>53</td>\n",
+       "      <td>114</td>\n",
        "      <td>57</td>\n",
+       "      <td>49</td>\n",
+       "      <td>10269</td>\n",
+       "      <td>226</td>\n",
+       "      <td>92</td>\n",
        "      <td>Tuesday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>6</th>\n",
        "      <td>2024-10-23</td>\n",
+       "      <td>21.974794</td>\n",
+       "      <td>22.214689</td>\n",
+       "      <td>36</td>\n",
+       "      <td>112</td>\n",
+       "      <td>12</td>\n",
        "      <td>0</td>\n",
        "      <td>10328</td>\n",
+       "      <td>65</td>\n",
+       "      <td>97</td>\n",
        "      <td>Wednesday</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>2024-10-24</td>\n",
+       "      <td>25.512568</td>\n",
+       "      <td>20.913710</td>\n",
+       "      <td>56</td>\n",
+       "      <td>104</td>\n",
+       "      <td>62</td>\n",
+       "      <td>0</td>\n",
+       "      <td>10247</td>\n",
+       "      <td>130</td>\n",
+       "      <td>94</td>\n",
+       "      <td>Thursday</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
+       "        date        NO2         O3  wind_speed  mean_temp  global_radiation  \\\n",
+       "0 2024-10-17  22.804605  22.769160          51        169                43   \n",
+       "1 2024-10-18  23.268500  23.307332          21        155                42   \n",
+       "2 2024-10-19  23.910064  23.171714          41        147                43   \n",
+       "3 2024-10-20  22.573238  23.537845          81        155                 0   \n",
+       "4 2024-10-21  21.145700  24.020696          58        144                27   \n",
+       "5 2024-10-22  21.776580  23.335886          53        114                57   \n",
+       "6 2024-10-23  21.974794  22.214689          36        112                12   \n",
+       "7 2024-10-24  25.512568  20.913710          56        104                62   \n",
        "\n",
+       "   percipitation  pressure  minimum_visibility  humidity    weekday  \n",
+       "0              6     10100                 371        86   Thursday  \n",
+       "1             39     10140                  45        97     Friday  \n",
+       "2             16     10141                 228        89   Saturday  \n",
+       "3              5     10160                 415        83     Sunday  \n",
+       "4             43     10206                 220        92     Monday  \n",
+       "5             49     10269                 226        92    Tuesday  \n",
+       "6              0     10328                  65        97  Wednesday  \n",
+       "7              0     10247                 130        94   Thursday  "
       ]
      },
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "week_data"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "array([[10.33808859, 16.00098432, 19.64377496]])"
       ]
      },
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "predictions_O3"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "array([[25.68519992, 25.76030745, 31.21057679]])"
       ]
      },
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "predictions_NO2"
    ]
   }
  ],