# datagouv-french-data-analyst / reexport_data.py
# (uploaded by axel-darmouni — commit "update", 2508004)
import pandas as pd

# Semicolon-separated metadata dump listing every dataset to consider.
SOURCE_FILE = 'dataset_metadata/full_dts_list.csv'
df = pd.read_csv(SOURCE_FILE, sep=";")
# Show every column present in the metadata dump.
print("Columns in the dataset:")
print(df.columns.tolist())
print("\n" + "="*50 + "\n")

# Inspect the distinct license identifiers, guarding against a schema
# where the column is absent.
print("Unique values in 'license' column:")
if 'license' in df.columns:
    licenses = df['license'].unique()
    for idx, name in enumerate(licenses, start=1):
        print(f"{idx}. {name}")
    print(f"\nTotal unique license values: {len(licenses)}")
    # How many datasets carry each license.
    print("\nLicense value counts:")
    print(df['license'].value_counts())
else:
    print("Column 'license' not found in the dataset.")
    print("Available columns are:", df.columns.tolist())
print("\n" + "="*50 + "\n")

# Keep only the columns needed for the re-export.
required_columns = ["title", "url", "license", "quality_score"]

# Partition into columns that exist vs. columns missing from this dump,
# so a schema change degrades gracefully instead of crashing.
existing_columns = [col for col in required_columns if col in df.columns]
missing_columns = [col for col in required_columns if col not in df.columns]
print(f"Found columns: {existing_columns}")
if missing_columns:
    print(f"Missing columns: {missing_columns}")

# Work on a copy so the later filtering never mutates the original frame.
df_filtered = df[existing_columns].copy()
print(f"\nOriginal dataset shape: {df.shape}")
print(f"After selecting columns: {df_filtered.shape}")

# Drop rows with no license value — but only when the column exists.
# An unconditional dropna(subset=['license']) would raise KeyError in the
# missing-column case that the block above only warns about.
if 'license' in df_filtered.columns:
    df_filtered = df_filtered.dropna(subset=['license'])
    print(f"After removing NaN license values: {df_filtered.shape}")
# # Filter for datasets that include France in spatial granularity
# if 'spatial.zones' in df_filtered.columns:
# # Check unique values in spatial.granularity before filtering
# print(f"\nUnique values in 'spatial.zones' column (first 10):")
# unique_spatial = df_filtered['spatial.zones'].dropna().unique()
# for i, value in enumerate(unique_spatial[:10], 1):
# print(f"{i}. {value}")
# if len(unique_spatial) > 10:
# print(f"... and {len(unique_spatial) - 10} more values")
#
# # Filter for France (case-insensitive search)
# france_filter = df_filtered['spatial.zones'].str.contains('France', case=False, na=False)
# df_filtered = df_filtered[france_filter]
#
# print(f"After filtering for France in spatial zones: {df_filtered.shape}")
# else:
# print("Warning: 'spatial.zones' column not found, skipping France filter")
# Filter by quality score (keep only >= 0.8) and sort best-first.
# NOTE(review): a previous comment said "> 0.6"; the code has always
# applied the 0.8 threshold — the threshold is now named explicitly.
QUALITY_THRESHOLD = 0.8
if 'quality_score' in df_filtered.columns:
    print(f"Before quality filtering: {df_filtered.shape}")
    df_filtered = df_filtered[df_filtered['quality_score'] >= QUALITY_THRESHOLD]
    print(f"After filtering quality_score >= {QUALITY_THRESHOLD}: {df_filtered.shape}")

    # Sort by quality score, descending — highest quality first.
    df_filtered = df_filtered.sort_values('quality_score', ascending=False)
    print(f"Dataset sorted by quality_score (highest first)")

    # Report the score range of the surviving rows (skip when empty to
    # avoid min()/max() on an empty series).
    if not df_filtered.empty:
        print(f"Quality score range: {df_filtered['quality_score'].min():.2f} - {df_filtered['quality_score'].max():.2f}")
else:
    print("Warning: 'quality_score' column not found, skipping quality filtering and sorting")
# Save to CSV. The license column was only needed for filtering, so drop
# it before export; errors='ignore' keeps this safe when the column was
# missing from the source dump (the script only warns in that case, and
# a plain drop('license', axis=1) would raise KeyError here).
df_filtered = df_filtered.drop(columns='license', errors='ignore')
output_file = 'filtered_dataset.csv'
df_filtered.to_csv(output_file, index=False)
print(f"\nFiltered dataset saved to: {output_file}")
print(f"Final dataset contains {len(df_filtered)} rows and {len(df_filtered.columns)} columns")

# Final summary banner ("βœ…" was UTF-8 mojibake for the check mark).
print("\n" + "="*50)
print("✅ Pre-processing complete!")
print("Files created:")
print(f" - {output_file}: Filtered dataset")
print("="*50)