Spaces:

Mihkelmj
/

utrecht-pollution-prediction

Sleeping

App Files Files Community

utrecht-pollution-prediction / src /data_api_calls.py

elisaklunder

stupid api

3dd6a8c about 1 year ago

raw

history blame

7.09 kB

	import codecs
	import csv
	import http.client
	import os
	import re
	import sys
	import urllib.request
	from datetime import date, timedelta
	from io import StringIO

	import pandas as pd

	# Path (relative to the working directory) of the CSV cache that
	# update_weather_data() reads and rewrites.
	WEATHER_DATA_FILE = "weather_data.csv"
	# Path (relative to the working directory) of the CSV cache that
	# update_pollution_data() reads and rewrites.
	POLLUTION_DATA_FILE = "pollution_data.csv"


	def update_weather_data():
	    """Append any missing daily weather rows for Utrecht to WEATHER_DATA_FILE.

	    When the CSV already exists, the request starts the day after its latest
	    ``date``; otherwise (or when the file has no rows) the last seven days are
	    requested. The Visual Crossing timeline endpoint is asked for CSV output,
	    its ``datetime`` column is renamed to ``date``, and the merged result,
	    de-duplicated on ``date`` (last occurrence wins), is written back.

	    Returns:
	        None. Nothing is written when the file is already up to date or the
	        response carries no data rows.

	    Raises:
	        SystemExit: after printing the error, when the request fails with an
	            HTTP or connection error.
	    """
	    today = date.today()
	    default_start = today - timedelta(7)

	    if os.path.exists(WEATHER_DATA_FILE):
	        df = pd.read_csv(WEATHER_DATA_FILE)
	        last_date = pd.to_datetime(df["date"]).max()
	        # A file without rows yields NaT; fall back to the default window.
	        if pd.isna(last_date):
	            start = default_start
	        else:
	            start = (last_date + timedelta(1)).date()
	    else:
	        df = pd.DataFrame()
	        start = default_start

	    # The file already contains today's row. Requesting a range whose start
	    # lies after its end is pointless and, if the API rejects it, would end
	    # the whole process through the error handlers below.
	    if start > today:
	        return

	    start_date = start.isoformat()
	    end_date = today.isoformat()
	    # NOTE(review): the API key is hard-coded in the URL; consider loading it
	    # from configuration instead of keeping it in source control.
	    url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Utrecht/{start_date}/{end_date}?unitGroup=metric&elements=datetime%2Cwindspeed%2Ctemp%2Csolarradiation%2Cprecip%2Cpressure%2Cvisibility%2Chumidity&include=days&key=7Y6AY56M6RWVNHQ3SAVHNJWFS&maxStations=1&contentType=csv"

	    try:
	        # The context manager closes the HTTP response even if decoding fails.
	        with urllib.request.urlopen(url) as response:
	            reader = csv.reader(codecs.iterdecode(response, "utf-8"))
	            # Blank lines come back from csv.reader as empty lists; drop them.
	            rows = [row for row in reader if row]
	    except urllib.error.HTTPError as e:
	        # Must be caught before URLError because HTTPError subclasses it.
	        ErrorInfo = e.read().decode()
	        print("Error code: ", e.code, ErrorInfo)
	        sys.exit()
	    except urllib.error.URLError as e:
	        # A plain URLError has no read()/code (only HTTPError does), so
	        # report its reason instead.
	        print("Error: ", e.reason)
	        sys.exit()

	    # Only a header line (or nothing at all): there is nothing to append.
	    if len(rows) < 2:
	        return

	    # The first CSV line holds the column names; the rest are data rows.
	    table = pd.DataFrame(rows)
	    table.columns = table.iloc[0]
	    new_data = table[1:].rename(columns={"datetime": "date"})

	    updated_df = pd.concat([df, new_data], ignore_index=True)
	    updated_df = updated_df.drop_duplicates(subset="date", keep="last")
	    updated_df.to_csv(WEATHER_DATA_FILE, index=False)


	def update_pollution_data():
	    """Refresh POLLUTION_DATA_FILE with daily NO2 and O3 averages.

	    For each of the eight days ending today, the 24-hour window that closes
	    at 09:00Z on that day is requested from api.luchtmeetnet.nl for stations
	    NL10636, NL10639 and NL10643, and every value returned for a pollutant is
	    averaged into one number for that day. A day without any values is stored
	    as NaN so both pollutants stay aligned with their date.

	    If the CSV already exists, only rows dated after its latest ``date`` are
	    appended; otherwise the eight rows become the file. Dates are written as
	    ``YYYY-MM-DD`` strings.

	    Returns:
	        None.
	    """
	    particles = ["NO2", "O3"]
	    stations = ["NL10636", "NL10639", "NL10643"]
	    today = date.today()

	    rows = []
	    # Oldest day first, so the rows come out in chronological order.
	    for days_ago in range(7, -1, -1):
	        window_end = today - timedelta(days_ago)
	        end = window_end.isoformat() + "T09:00:00Z"
	        start = (window_end - timedelta(1)).isoformat() + "T09:00:00Z"

	        row = {"date": window_end.isoformat()}
	        for particle in particles:
	            # Collected per day and per pollutant so that one average never
	            # absorbs values from another day or from the other pollutant.
	            values = []
	            for station in stations:
	                values.extend(_fetch_station_values(station, particle, start, end))
	            row[particle] = sum(values) / len(values) if values else float("nan")
	        rows.append(row)

	    avg_combined_data = pd.DataFrame(rows, columns=["date", "NO2", "O3"])

	    if os.path.exists(POLLUTION_DATA_FILE):
	        existing_data = pd.read_csv(POLLUTION_DATA_FILE)
	        # Keep only the YYYY-MM-DD part so rows that were saved with a time
	        # suffix still parse and compare consistently.
	        existing_data["date"] = existing_data["date"].astype(str).str[:10]
	        last_date = pd.to_datetime(existing_data["date"]).max()
	        if pd.isna(last_date):
	            # The file has no usable rows yet: everything fetched is new.
	            new_data = avg_combined_data
	        else:
	            is_new = pd.to_datetime(avg_combined_data["date"]) > last_date
	            new_data = avg_combined_data[is_new]
	        updated_data = pd.concat([existing_data, new_data], ignore_index=True)
	        updated_data = updated_data.drop_duplicates(subset="date", keep="last")
	    else:
	        updated_data = avg_combined_data

	    updated_data.to_csv(POLLUTION_DATA_FILE, index=False)


	def _fetch_station_values(station, particle, start, end):
	    """Return the numeric values one station reported for one pollutant and window.

	    Args:
	        station: Station number placed in the query, e.g. "NL10636".
	        particle: Formula placed in the query, "NO2" or "O3".
	        start: Window start timestamp string for the ``start`` query field.
	        end: Window end timestamp string for the ``end`` query field.

	    Returns:
	        A list of floats; empty when the response is not HTTP 200, is not
	        valid JSON, or holds no numeric values.
	    """
	    import json

	    conn = http.client.HTTPSConnection("api.luchtmeetnet.nl", timeout=30)
	    try:
	        conn.request(
	            "GET",
	            f"/open_api/measurements?station_number={station}&formula={particle}&page=1&order_by=timestamp_measured&order_direction=desc&end={end}&start={start}",
	            "",
	            {},
	        )
	        res = conn.getresponse()
	        status = res.status
	        body = res.read().decode("utf-8")
	    finally:
	        # Always release the socket, also when the request itself fails.
	        conn.close()

	    # Best effort: a failing station simply contributes no values.
	    if status != 200:
	        return []
	    try:
	        payload = json.loads(body)
	    except ValueError:
	        return []

	    # NOTE(review): assumes the body has the form
	    # {"data": [{"value": <number>, ...}, ...]} as described in the
	    # Luchtmeetnet Open API documentation - confirm against a live response.
	    records = payload.get("data") if isinstance(payload, dict) else None
	    if not isinstance(records, list):
	        return []
	    return [
	        float(record["value"])
	        for record in records
	        if isinstance(record, dict) and isinstance(record.get("value"), (int, float))
	    ]


	def reverse_pollution(NO2, O3, data):
	    """Write the NO2 and O3 values into ``data`` in reversed order.

	    The last element of each list is stored at row label 0, the one before
	    it at label 1, and so on, using label-based ``.loc`` assignment.

	    Args:
	        NO2: Sequence of NO2 values; it is read but not modified.
	        O3: Sequence of O3 values; it is read but not modified.
	        data: DataFrame that is updated in place in its "NO2" and "O3"
	            columns.

	    Returns:
	        The same ``data`` object that was passed in.
	    """
	    for column, values in (("NO2", NO2), ("O3", O3)):
	        # reversed() walks from the back without emptying the caller's list.
	        for row_label, value in enumerate(reversed(values)):
	            data.loc[row_label, column] = value
	    return data


	def get_combined_data():
	    """Refresh both CSV caches and return weather and pollution in one frame.

	    Calls update_weather_data() and update_pollution_data(), then reads
	    WEATHER_DATA_FILE, renames its columns, derives ``weekday`` from ``date``,
	    rescales and truncates the numeric weather columns to integers, and
	    attaches the NO2 and O3 values from POLLUTION_DATA_FILE by matching on
	    ``date``. Weather dates without a pollution row get NaN.

	    Returns:
	        A DataFrame whose first columns are, in order: date, NO2, O3,
	        wind_speed, mean_temp, global_radiation, percipitation, pressure,
	        minimum_visibility, humidity, weekday. Any other columns present in
	        the weather file follow after these.
	    """
	    update_weather_data()
	    update_pollution_data()

	    combined_df = pd.read_csv(WEATHER_DATA_FILE).rename(
	        columns={
	            "windspeed": "wind_speed",
	            "temp": "mean_temp",
	            "solarradiation": "global_radiation",
	            "precip": "percipitation",
	            "sealevelpressure": "pressure",
	            "visibility": "minimum_visibility",
	        }
	    )

	    combined_df["date"] = pd.to_datetime(combined_df["date"])
	    combined_df["weekday"] = combined_df["date"].dt.day_name()

	    # NOTE(review): the factors below presumably convert to the units used by
	    # the training data (e.g. km/h to 0.1 m/s for wind speed) - confirm.
	    combined_df["wind_speed"] = (combined_df["wind_speed"] / 3.6) * 10
	    for column in ("mean_temp", "minimum_visibility", "percipitation", "pressure"):
	        combined_df[column] = combined_df[column] * 10

	    # astype(int) truncates toward zero and raises if a column contains NaN.
	    integer_columns = (
	        "wind_speed",
	        "mean_temp",
	        "minimum_visibility",
	        "percipitation",
	        "pressure",
	        "humidity",
	        "global_radiation",
	    )
	    for column in integer_columns:
	        combined_df[column] = combined_df[column].astype(int)

	    pollution_df = pd.read_csv(POLLUTION_DATA_FILE)
	    # Compare on the calendar day only, so a stored time suffix cannot break
	    # parsing or matching.
	    pollution_df["date"] = pd.to_datetime(pollution_df["date"].astype(str).str[:10])
	    # Series.map needs a unique index; keep the latest row per date.
	    pollution_by_date = pollution_df.drop_duplicates(
	        subset="date", keep="last"
	    ).set_index("date")
	    # Match on date, not row position, so the two files may differ in
	    # length or starting day without shifting values onto the wrong date.
	    for particle in ("NO2", "O3"):
	        combined_df[particle] = combined_df["date"].map(pollution_by_date[particle])

	    # Select by name so the result does not depend on the column order of
	    # the weather CSV.
	    ordered = [
	        "date",
	        "NO2",
	        "O3",
	        "wind_speed",
	        "mean_temp",
	        "global_radiation",
	        "percipitation",
	        "pressure",
	        "minimum_visibility",
	        "humidity",
	        "weekday",
	    ]
	    extras = [column for column in combined_df.columns if column not in ordered]
	    return combined_df[ordered + extras]