"""Update a submission CSV from patterns found in ASR transcriptions.

When run as a script this file:
  1. flags submission rows whose ASR row contains the greeting word,
  2. flags submission rows whose ASR transcription contains a time request,
  3. reports which ASR transcriptions mention a submission row's first and
     last name (report only; the submission file is not changed by this step),
  4. writes the updated submission back to SUBMISSION_PATH.
"""

import re

import pandas as pd

# =============================================================================
# CONFIGURATION - Update these paths as needed
# =============================================================================
ASR_RESULTS_PATH = "asr_results (2).csv"
SUBMISSION_PATH = "75.csv"

# Text searched for in the ASR rows and the submission columns that get flagged.
GREETING_WORD = "สวัสดี"
GREETING_COLUMN = "กล่าวสวัสดี"
REQUEST_TIME_COLUMN = "บอกระยะเวลาที่ใช้ในการเข้าพบ"

# "ขอเวลา", optional whitespace, one or more digits, optional whitespace, then
# the unit "นาที" or "ชั่วโมง". Compiled once because it is applied to every
# transcription.
_REQUEST_TIME_RE = re.compile(r"ขอเวลา\s*\d+\s*(?:นาที|ชั่วโมง)")


# =============================================================================
# FUNCTION DEFINITIONS
# =============================================================================
def has_request_time(text):
    """
    Check if text contains a time request such as "ขอเวลา 5 นาที" or
    "ขอเวลา 2 ชั่วโมง".

    The amount must be written with digits; whitespace around the digits is
    optional. Non-string input is converted with str() before searching.

    Args:
        text: Input text to search

    Returns:
        bool: True if pattern is found, False otherwise
    """
    return bool(_REQUEST_TIME_RE.search(str(text)))


def name_match(row, transcription):
    """
    Check if both first and last name appear in transcription.

    Missing values (None / NaN) and empty names never match. Without this
    guard a missing name would be stringified to "nan" and would match a
    missing transcription (also "nan"), and an empty name would match every
    transcription.

    Args:
        row: Mapping or DataFrame row containing first_name and last_name
        transcription: Text to search in

    Returns:
        bool: True if both names are found, False otherwise
    """
    first = row["first_name"]
    last = row["last_name"]
    if pd.isna(first) or pd.isna(last) or pd.isna(transcription):
        return False

    first = str(first)
    last = str(last)
    if not first or not last:
        return False

    text = str(transcription)
    return first in text and last in text


def find_rows_containing(df, needle):
    """
    Return the rows of df in which at least one cell contains needle.

    Every column is searched; each cell is compared through its string form
    and needle is treated as a literal substring (not a regex).

    Args:
        df: DataFrame to search
        needle: Substring to look for

    Returns:
        DataFrame: The matching rows of df, in their original order
    """
    mask = pd.Series(False, index=df.index)
    # Column-wise search instead of a row-wise apply: one vectorised call per
    # column, and it also yields a proper (empty) mask for an empty frame.
    for _, column in df.items():
        mask = mask | column.astype(str).str.contains(needle, regex=False, na=False)
    return df[mask]


def set_flag_for_ids(df, ids, column):
    """
    Set df[column] to True for rows whose "id" is in ids and whose current
    value equals False. df is modified in place.

    Rows whose current value is anything other than False are left untouched.

    Args:
        df: DataFrame with an "id" column and the flag column
        ids: Collection of id values to flag
        column: Name of the flag column to update

    Returns:
        int: Number of rows changed from False to True
    """
    # .eq(False) is an element-wise comparison; "~df[column]" would not be
    # equivalent when the column holds missing or non-boolean values.
    mask = df["id"].isin(ids) & df[column].eq(False)
    num_changed = int(mask.sum())
    df.loc[mask, column] = True
    return num_changed


def find_name_matches(df_submission, df_asr):
    """
    Pair every submission row with every ASR row whose transcription contains
    both the submission row's first and last name (see name_match).

    Args:
        df_submission: DataFrame with first_name and last_name columns
        df_asr: DataFrame with id and transcription columns

    Returns:
        DataFrame: Columns id (the ASR row id), first_name, last_name and
        transcription; one row per match, ordered by submission row and then
        by ASR row. The columns are present even when there are no matches.
    """
    # Materialise the ASR pairs once; they are the same for every submission row.
    asr_pairs = list(zip(df_asr["id"], df_asr["transcription"]))

    matched_rows = []
    for _, sub_row in df_submission.iterrows():
        for asr_id, transcription in asr_pairs:
            if name_match(sub_row, transcription):
                matched_rows.append(
                    {
                        "id": asr_id,
                        "first_name": sub_row["first_name"],
                        "last_name": sub_row["last_name"],
                        "transcription": transcription,
                    }
                )

    return pd.DataFrame(
        matched_rows,
        columns=["id", "first_name", "last_name", "transcription"],
    )


# The pipeline runs only when the file is executed as a script, so the helpers
# above can be imported without touching the CSV files. The result variables
# are still module-level names when run as a script.
if __name__ == "__main__":
    # =========================================================================
    # LOAD DATA
    # =========================================================================
    print("Loading data files...")
    df_asr = pd.read_csv(ASR_RESULTS_PATH)
    df_submission = pd.read_csv(SUBMISSION_PATH)

    # =========================================================================
    # TASK 1: Find rows containing "สวัสดี" (greeting)
    # =========================================================================
    print("\n=== TASK 1: Finding greeting patterns ===")
    rows_with_sawasdee = find_rows_containing(df_asr, GREETING_WORD)
    print(f"Found {len(rows_with_sawasdee)} rows with greeting patterns")
    print(rows_with_sawasdee)

    # Update submission file for greeting column
    matching_ids_greeting = set(rows_with_sawasdee["id"])
    num_changed_greeting = set_flag_for_ids(
        df_submission, matching_ids_greeting, GREETING_COLUMN
    )
    print(f"Number of rows updated for greeting: {num_changed_greeting}")

    # =========================================================================
    # TASK 2: Find rows with time request patterns
    # =========================================================================
    print("\n=== TASK 2: Finding time request patterns ===")
    rows_with_request_time = df_asr[df_asr["transcription"].apply(has_request_time)]
    print(f"Found {len(rows_with_request_time)} rows with time request patterns")
    print(rows_with_request_time[["id", "transcription"]])

    # Update submission file for time request column
    request_time_ids = set(rows_with_request_time["id"])
    num_changed_time = set_flag_for_ids(
        df_submission, request_time_ids, REQUEST_TIME_COLUMN
    )
    print(f"Number of rows updated for time request: {num_changed_time}")

    # =========================================================================
    # TASK 3: Name matching analysis
    # =========================================================================
    print("\n=== TASK 3: Name matching analysis ===")
    # Find name matches using the updated submission file
    df_matched = find_name_matches(df_submission, df_asr)
    print(f"Found {len(df_matched)} name matches")
    print(df_matched)

    # =========================================================================
    # SAVE UPDATED SUBMISSION FILE
    # =========================================================================
    # NOTE: this overwrites the input submission file in place.
    print("\n=== SAVING UPDATED SUBMISSION FILE ===")
    df_submission.to_csv(SUBMISSION_PATH, index=False)
    print(f"Updated submission file saved as: {SUBMISSION_PATH}")

    # =========================================================================
    # SUMMARY
    # =========================================================================
    print("\n=== PROCESSING SUMMARY ===")
    print(f"✓ Greeting patterns found: {len(rows_with_sawasdee)}")
    print(f"✓ Greeting updates applied: {num_changed_greeting}")
    print(f"✓ Time request patterns found: {len(rows_with_request_time)}")
    print(f"✓ Time request updates applied: {num_changed_time}")
    print(f"✓ Name matches found: {len(df_matched)}")
    print(f"✓ All changes applied to: {SUBMISSION_PATH}")
    print(f"✓ Source data from: {ASR_RESULTS_PATH}")