"""Inspect a dataset-metadata CSV and export a quality-filtered subset.

Run as a script, this module:

1. reads ``dataset_metadata/full_dts_list.csv`` (``;``-separated),
2. prints the column names and the distinct values of the ``license`` column,
3. keeps whichever of ``REQUIRED_COLUMNS`` are present,
4. drops rows whose ``license`` is NaN,
5. keeps rows with ``quality_score >= 0.8``, sorted highest score first,
6. removes the ``license`` column and writes ``filtered_dataset.csv``.

Steps that depend on a column are skipped with a message when that column is
missing, instead of raising ``KeyError``.
"""

import pandas as pd

INPUT_FILE = 'dataset_metadata/full_dts_list.csv'
OUTPUT_FILE = 'filtered_dataset.csv'
REQUIRED_COLUMNS = ["title", "url", "license", "quality_score"]
MIN_QUALITY_SCORE = 0.8
SECTION_RULE = "=" * 50


def report_license_values(df):
    """Print the distinct ``license`` values of *df* and how often each occurs.

    Prints a "not found" notice plus the available columns when *df* has no
    ``license`` column. Returns nothing.
    """
    print("Unique values in 'license' column:")
    if 'license' not in df.columns:
        print("Column 'license' not found in the dataset.")
        print("Available columns are:", df.columns.tolist())
        return

    unique_licences = df['license'].unique()
    for i, licence in enumerate(unique_licences, 1):
        print(f"{i}. {licence}")

    print(f"\nTotal unique license values: {len(unique_licences)}")

    print("\nLicense value counts:")
    print(df['license'].value_counts())


def split_columns(df, required_columns):
    """Partition *required_columns* by whether they exist in *df*.

    Returns:
        ``(existing, missing)`` - two lists that preserve the order of
        *required_columns*.
    """
    existing = [col for col in required_columns if col in df.columns]
    missing = [col for col in required_columns if col not in df.columns]
    return existing, missing


def drop_missing_licenses(df_filtered):
    """Return *df_filtered* without the rows whose ``license`` is NaN.

    When the frame has no ``license`` column the step is skipped with a
    warning and the frame is returned unchanged; an unconditional ``dropna``
    on a missing column would raise ``KeyError``.
    """
    if 'license' not in df_filtered.columns:
        print("Warning: 'license' column not found, skipping NaN license removal")
        return df_filtered

    df_filtered = df_filtered.dropna(subset=['license'])
    print(f"After removing NaN license values: {df_filtered.shape}")
    return df_filtered


def filter_by_quality(df_filtered, min_score=MIN_QUALITY_SCORE):
    """Keep rows with ``quality_score >= min_score``, best score first.

    Prints the frame shape before and after filtering and, when rows remain,
    the score range. When the frame has no ``quality_score`` column the step
    is skipped with a warning and the frame is returned unchanged.
    """
    if 'quality_score' not in df_filtered.columns:
        print("Warning: 'quality_score' column not found, skipping quality filtering and sorting")
        return df_filtered

    print(f"Before quality filtering: {df_filtered.shape}")
    # NOTE(review): assumes quality_score was parsed as a numeric column - confirm
    # against the CSV (a decimal comma would leave it as strings).
    df_filtered = df_filtered[df_filtered['quality_score'] >= min_score]
    print(f"After filtering quality_score >= {min_score}: {df_filtered.shape}")

    df_filtered = df_filtered.sort_values('quality_score', ascending=False)
    print("Dataset sorted by quality_score (highest first)")

    # min()/max() of an empty column are NaN, so only report a range when rows remain.
    if not df_filtered.empty:
        lowest = df_filtered['quality_score'].min()
        highest = df_filtered['quality_score'].max()
        print(f"Quality score range: {lowest:.2f} - {highest:.2f}")
    return df_filtered


if __name__ == "__main__":
    # Kept at module level (not inside a function) so the script's original
    # global names remain available after it has run.
    df = pd.read_csv(INPUT_FILE, sep=";")

    print("Columns in the dataset:")
    print(df.columns.tolist())
    print("\n" + SECTION_RULE + "\n")

    report_license_values(df)
    print("\n" + SECTION_RULE + "\n")

    required_columns = list(REQUIRED_COLUMNS)
    existing_columns, missing_columns = split_columns(df, required_columns)

    print(f"Found columns: {existing_columns}")
    if missing_columns:
        print(f"Missing columns: {missing_columns}")

    # .copy() detaches the selection so later steps never touch df.
    df_filtered = df[existing_columns].copy()

    print(f"\nOriginal dataset shape: {df.shape}")
    print(f"After selecting columns: {df_filtered.shape}")

    df_filtered = drop_missing_licenses(df_filtered)
    df_filtered = filter_by_quality(df_filtered)

    # The license column is only needed for the NaN filter above and is left
    # out of the export; errors='ignore' tolerates it being absent.
    df_filtered = df_filtered.drop(columns=['license'], errors='ignore')
    output_file = OUTPUT_FILE
    df_filtered.to_csv(output_file, index=False)

    print(f"\nFiltered dataset saved to: {output_file}")
    print(f"Final dataset contains {len(df_filtered)} rows and {len(df_filtered.columns)} columns")

    print("\n" + SECTION_RULE)
    # U+2705 (check mark) written as an escape so a wrong source encoding
    # cannot garble it.
    print("\u2705 Pre-processing complete!")
    print("Files created:")
    print(f" - {output_file}: Filtered dataset")
    print(SECTION_RULE)