"""Filter a dataset-metadata CSV by license presence and quality score.

Reads ``dataset_metadata/full_dts_list.csv`` (semicolon-separated), keeps
the title/url/license/quality_score columns, drops rows without a license
or with a quality_score below the threshold, sorts by quality_score
(highest first), removes the license column and writes the result to
``filtered_dataset.csv``.
"""

import pandas as pd

INPUT_FILE = 'dataset_metadata/full_dts_list.csv'
OUTPUT_FILE = 'filtered_dataset.csv'
# Rows with quality_score below this value are discarded.
QUALITY_THRESHOLD = 0.8
REQUIRED_COLUMNS = ["title", "url", "license", "quality_score"]


def _report_license_values(df):
    """Print the unique values and value counts of the 'license' column."""
    print("Unique values in 'license' column:")
    if 'license' in df.columns:
        unique_licences = df['license'].unique()
        for i, licence in enumerate(unique_licences, 1):
            print(f"{i}. {licence}")
        print(f"\nTotal unique license values: {len(unique_licences)}")
        print("\nLicense value counts:")
        print(df['license'].value_counts())
    else:
        print("Column 'license' not found in the dataset.")
        print("Available columns are:", df.columns.tolist())


def _select_columns(df):
    """Return a copy of *df* restricted to the REQUIRED_COLUMNS that exist."""
    existing_columns = [col for col in REQUIRED_COLUMNS if col in df.columns]
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    print(f"Found columns: {existing_columns}")
    if missing_columns:
        print(f"Missing columns: {missing_columns}")
    selected = df[existing_columns].copy()
    print(f"\nOriginal dataset shape: {df.shape}")
    print(f"After selecting columns: {selected.shape}")
    return selected


def _filter_quality(df):
    """Keep only rows with quality_score >= QUALITY_THRESHOLD, sorted best-first."""
    if 'quality_score' not in df.columns:
        print("Warning: 'quality_score' column not found, skipping quality filtering and sorting")
        return df
    print(f"Before quality filtering: {df.shape}")
    df = df[df['quality_score'] >= QUALITY_THRESHOLD]
    print(f"After filtering quality_score >= {QUALITY_THRESHOLD}: {df.shape}")
    # Highest-quality datasets first.
    df = df.sort_values('quality_score', ascending=False)
    print("Dataset sorted by quality_score (highest first)")
    if not df.empty:
        print(f"Quality score range: {df['quality_score'].min():.2f} - {df['quality_score'].max():.2f}")
    return df


def main():
    """Run the full preprocessing pipeline and write the filtered CSV."""
    df = pd.read_csv(INPUT_FILE, sep=";")

    print("Columns in the dataset:")
    print(df.columns.tolist())
    print("\n" + "=" * 50 + "\n")

    _report_license_values(df)
    print("\n" + "=" * 50 + "\n")

    df_filtered = _select_columns(df)

    # Filter out rows where license is NaN. Guarded: dropna(subset=...)
    # raises KeyError when the column is absent, even though that case is
    # already reported above.
    if 'license' in df_filtered.columns:
        df_filtered = df_filtered.dropna(subset=['license'])
        print(f"After removing NaN license values: {df_filtered.shape}")

    df_filtered = _filter_quality(df_filtered)

    # Drop the license column before saving; errors='ignore' keeps this
    # safe when the input had no license column to begin with.
    df_filtered = df_filtered.drop(columns=['license'], errors='ignore')
    df_filtered.to_csv(OUTPUT_FILE, index=False)
    print(f"\nFiltered dataset saved to: {OUTPUT_FILE}")
    print(f"Final dataset contains {len(df_filtered)} rows and {len(df_filtered.columns)} columns")

    print("\n" + "=" * 50)
    print("✅ Pre-processing complete!")
    print("Files created:")
    print(f" - {OUTPUT_FILE}: Filtered dataset")
    print("=" * 50)


if __name__ == "__main__":
    main()