FauziIsyrinApridal committed on
Commit
174c308
·
1 Parent(s): 7add442
Files changed (1) hide show
  1. scrapping/utils/supabase_utils.py +85 -12
scrapping/utils/supabase_utils.py CHANGED
@@ -19,14 +19,27 @@ def _list_names(supabase, bucket: str) -> List[str]:
19
 
20
 
21
  def _extract_prefix_and_match_pattern(filename: str):
22
- # Expect filenames like: <prefix>_YYYYMMDD_HHMMSS.txt
23
- m = re.match(r"^(.*)_(\d{8}_\d{6})\.txt$", filename)
24
- if not m:
25
- # fallback: treat entire name (without extension) as prefix
26
- base = filename.rsplit('.', 1)[0]
27
- return base, rf"^{re.escape(base)}_\d{{8}}_\d{{6}}\.txt$"
28
- prefix = m.group(1)
29
- pattern = rf"^{re.escape(prefix)}_\d{{8}}_\d{{6}}\.txt$"
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  return prefix, pattern
31
 
32
 
@@ -36,8 +49,15 @@ def _pick_latest_name(names: List[str], pattern: str) -> Optional[str]:
36
  return None
37
 
38
  def ts_key(name: str):
39
- m = re.search(r"_(\d{8}_\d{6})\.txt$", name)
40
- return m.group(1) if m else "00000000_000000"
 
 
 
 
 
 
 
41
 
42
  matched.sort(key=ts_key, reverse=True)
43
  return matched[0]
@@ -53,6 +73,38 @@ def _download_text(supabase, bucket: str, name: str) -> Optional[str]:
53
  return None
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
57
  """Upload file only if content differs from latest existing file with the same prefix.
58
 
@@ -67,18 +119,39 @@ def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str,
67
  payload = content.encode('utf-8')
68
 
69
  prefix, pattern = _extract_prefix_and_match_pattern(filename)
 
 
 
70
  names = _list_names(supabase, bucket)
 
 
71
  latest = _pick_latest_name(names, pattern)
72
  if latest:
 
73
  old_text = _download_text(supabase, bucket, latest)
74
- if old_text is not None and old_text == text:
75
- return {"result": "skipped"}
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  supabase.storage.from_(bucket).upload(
78
  path=filename,
79
  file=payload,
80
  file_options={"content-type": "text/plain; charset=utf-8"}
81
  )
 
82
  return {"result": "uploaded"}
83
  except Exception as e:
 
84
  return {"result": "error", "error": str(e)}
 
19
 
20
 
def _extract_prefix_and_match_pattern(filename: str):
    """Split *filename* into a prefix and a regex that matches its timestamped siblings.

    Recognised names look like ``<prefix>_YYYYMMDD_HHMM.<ext>`` or
    ``<prefix>_YYYYMMDD_HHMMSS.<ext>``. When the name carries no such
    timestamp suffix, the whole name without its extension is the prefix.

    Args:
        filename: Object name, optionally containing ``/``-separated folders.

    Returns:
        A ``(prefix, pattern)`` tuple. ``pattern`` is an anchored regex source
        string matching ``<prefix>_<8 digits>_<4 or 6 digits><ext>``. When
        *filename* has no extension, ``.txt`` is assumed for the pattern.
    """
    base, _, ext = filename.rpartition('.')
    # A dot is an extension separator only if something precedes it and it
    # lies in the last path segment ("dir.v2/name" has no extension).
    if not base or '/' in ext:
        base, ext = filename, ''

    # The time part is exactly HHMM or HHMMSS; a 5-digit field is not a
    # timestamp, so such a name is treated as having no timestamp suffix.
    m = re.fullmatch(r"(.*)_\d{8}_\d{4}(?:\d{2})?", base)
    prefix = m.group(1) if m else base

    ext_pattern = re.escape('.' + ext) if ext else r"\.txt"
    pattern = rf"^{re.escape(prefix)}_\d{{8}}_\d{{4}}(?:\d{{2}})?{ext_pattern}$"
    return prefix, pattern
44
 
45
 
 
49
  return None
50
 
def ts_key(name: str):
    """Return a sortable ``YYYYMMDD_HHMMSS`` key from the timestamp suffix of *name*.

    The suffix must be ``_YYYYMMDD_HHMM`` or ``_YYYYMMDD_HHMMSS`` directly
    before the final extension. An HHMM time is padded with ``00`` seconds so
    all keys have the same width and compare correctly as strings. Names
    without such a suffix get the lowest key, ``"00000000_000000"``.
    """
    # Exactly 4 or 6 time digits: a 5-digit field would otherwise produce
    # a key of a different width that sorts incorrectly against real ones.
    m = re.search(r"_(\d{8})_(\d{4}(?:\d{2})?)\.[^.]+$", name)
    if not m:
        return "00000000_000000"
    date, clock = m.groups()
    return f"{date}_{clock.ljust(6, '0')}"
61
 
62
  matched.sort(key=ts_key, reverse=True)
63
  return matched[0]
 
73
  return None
74
 
75
 
def _normalize_text(text: str) -> str:
    """Normalize *text* so two snapshots can be compared while ignoring volatile noise.

    Steps applied:
      - returns ``""`` for ``None`` or empty input instead of raising
      - removes any leading byte-order mark
      - drops lines whose first non-blank text is ``Diperbarui pada:`` or
        ``Tanggal Akses:`` (timestamp lines that change on every run)
      - strips trailing whitespace from each remaining line
      - collapses runs of blank lines into a single blank line
      - trims leading and trailing whitespace of the whole result

    Args:
        text: Raw document text.

    Returns:
        The normalized text with ``\\n`` line separators.
    """
    if not text:
        return ""

    text = text.lstrip("\ufeff")

    volatile_prefixes = ("Diperbarui pada:", "Tanggal Akses:")
    kept = [
        line.rstrip()
        for line in text.splitlines()
        if not line.lstrip().startswith(volatile_prefixes)
    ]

    # Lines are already right-stripped, so a blank line is exactly "".
    # Collapsing runs after the filter also merges the blank lines that
    # surrounded a removed timestamp line.
    collapsed = []
    for line in kept:
        if not line and collapsed and not collapsed[-1]:
            continue
        collapsed.append(line)

    return "\n".join(collapsed).strip()
106
+
107
+
108
  def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
109
  """Upload file only if content differs from latest existing file with the same prefix.
110
 
 
119
  payload = content.encode('utf-8')
120
 
121
  prefix, pattern = _extract_prefix_and_match_pattern(filename)
122
+ print(f"[DEDUP] Checking file: {filename}")
123
+ print(f"[DEDUP] Extracted prefix: '{prefix}', pattern: '{pattern}'")
124
+
125
  names = _list_names(supabase, bucket)
126
+ print(f"[DEDUP] Found {len(names)} total files in bucket")
127
+
128
  latest = _pick_latest_name(names, pattern)
129
  if latest:
130
+ print(f"[DEDUP] Latest existing file with same prefix: {latest}")
131
  old_text = _download_text(supabase, bucket, latest)
132
+ if old_text is not None:
133
+ old_normalized = _normalize_text(old_text)
134
+ new_normalized = _normalize_text(text)
135
+ print(f"[DEDUP] Old content length (normalized): {len(old_normalized)} chars")
136
+ print(f"[DEDUP] New content length (normalized): {len(new_normalized)} chars")
137
+
138
+ if old_normalized == new_normalized:
139
+ print(f"[DEDUP] ✅ Content identical - SKIPPING upload")
140
+ return {"result": "skipped"}
141
+ else:
142
+ print(f"[DEDUP] ❌ Content differs - PROCEEDING with upload")
143
+ else:
144
+ print(f"[DEDUP] ⚠️ Could not download existing file content - PROCEEDING with upload")
145
+ else:
146
+ print(f"[DEDUP] No existing file with same prefix found - PROCEEDING with upload")
147
 
148
  supabase.storage.from_(bucket).upload(
149
  path=filename,
150
  file=payload,
151
  file_options={"content-type": "text/plain; charset=utf-8"}
152
  )
153
+ print(f"[DEDUP] ✅ Successfully uploaded: {filename}")
154
  return {"result": "uploaded"}
155
  except Exception as e:
156
+ print(f"[DEDUP] ❌ Error during upload: {str(e)}")
157
  return {"result": "error", "error": str(e)}