import pandas as pd

# Read the CSV file
df = pd.read_csv('dataset_metadata/full_dts_list.csv', sep=";")
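# If the delimiter of the export ever changes, pandas can sniff it instead of
# hard-coding ';' (a sketch, not needed for the current file):
# df = pd.read_csv('dataset_metadata/full_dts_list.csv', sep=None, engine='python')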

# Print all columns
print("Columns in the dataset:")
print(df.columns.tolist())
print("\n" + "="*50 + "\n")

# Print unique values for license column
print("Unique values in 'license' column:")
if 'license' in df.columns:
    unique_licenses = df['license'].unique()
    for i, license_value in enumerate(unique_licenses, 1):
        print(f"{i}. {license_value}")

    print(f"\nTotal unique license values: {len(unique_licenses)}")
    
    # Also show value counts for license column
    print("\nLicense value counts:")
    print(df['license'].value_counts())
else:
    print("Column 'license' not found in the dataset.")
    print("Available columns are:", df.columns.tolist())

print("\n" + "="*50 + "\n")

# Select only the required columns
required_columns = ["title", "url", "license", "quality_score"]

# Check which columns exist
existing_columns = [col for col in required_columns if col in df.columns]
missing_columns = [col for col in required_columns if col not in df.columns]

print(f"Found columns: {existing_columns}")
if missing_columns:
    print(f"Missing columns: {missing_columns}")

# Select only existing columns
df_filtered = df[existing_columns].copy()

print(f"\nOriginal dataset shape: {df.shape}")
print(f"After selecting columns: {df_filtered.shape}")

# Filter out rows where license is NaN (skipped if the column is absent,
# which would otherwise raise a KeyError)
if 'license' in df_filtered.columns:
    df_filtered = df_filtered.dropna(subset=['license'])
    print(f"After removing NaN license values: {df_filtered.shape}")
# # Filter for datasets that cover France in their spatial zones
# if 'spatial.zones' in df_filtered.columns:
#     # Check unique values in spatial.zones before filtering
#     print(f"\nUnique values in 'spatial.zones' column (first 10):")
#     unique_spatial = df_filtered['spatial.zones'].dropna().unique()
#     for i, value in enumerate(unique_spatial[:10], 1):
#         print(f"{i}. {value}")
#     if len(unique_spatial) > 10:
#         print(f"... and {len(unique_spatial) - 10} more values")
#     
#     # Filter for France (case-insensitive search)
#     france_filter = df_filtered['spatial.zones'].str.contains('France', case=False, na=False)
#     df_filtered = df_filtered[france_filter]
#     
#     print(f"After filtering for France in spatial zones: {df_filtered.shape}")
# else:
#     print("Warning: 'spatial.zones' column not found, skipping France filter")

# Filter by quality score (keep only >= 0.8)
if 'quality_score' in df_filtered.columns:
    print(f"Before quality filtering: {df_filtered.shape}")
    df_filtered = df_filtered[df_filtered['quality_score'] >= 0.8]
    print(f"After filtering quality_score >= 0.8: {df_filtered.shape}")
    
    # Sort by quality score (descending order - highest quality first)
    df_filtered = df_filtered.sort_values('quality_score', ascending=False)
    print(f"Dataset sorted by quality_score (highest first)")
    
    # Show quality score distribution
    if not df_filtered.empty:
        print(f"Quality score range: {df_filtered['quality_score'].min():.2f} - {df_filtered['quality_score'].max():.2f}")
else:
    print("Warning: 'quality_score' column not found, skipping quality filtering and sorting")

# Save to CSV
# Drop the license column before saving; errors='ignore' guards against the
# column being absent
df_filtered = df_filtered.drop(columns=['license'], errors='ignore')
output_file = 'filtered_dataset.csv'
df_filtered.to_csv(output_file, index=False)
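# Note: to_csv writes with pandas' default ',' separator; pass sep=';' if the
# output should mirror the ';'-separated input format.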

print(f"\nFiltered dataset saved to: {output_file}")
print(f"Final dataset contains {len(df_filtered)} rows and {len(df_filtered.columns)} columns")

print("\n" + "="*50)
print("✅ Pre-processing complete!")
print("Files created:")
print(f"  - {output_file}: Filtered dataset")
print("="*50)