Spaces:
Sleeping
Sleeping
FauziIsyrinApridal
committed on
Commit
·
174c308
1
Parent(s):
7add442
revisi 12
Browse files
scrapping/utils/supabase_utils.py
CHANGED
@@ -19,14 +19,27 @@ def _list_names(supabase, bucket: str) -> List[str]:
|
|
19 |
|
20 |
|
21 |
def _extract_prefix_and_match_pattern(filename: str):
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
return prefix, pattern
|
31 |
|
32 |
|
@@ -36,8 +49,15 @@ def _pick_latest_name(names: List[str], pattern: str) -> Optional[str]:
|
|
36 |
return None
|
37 |
|
38 |
def ts_key(name: str):
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
matched.sort(key=ts_key, reverse=True)
|
43 |
return matched[0]
|
@@ -53,6 +73,38 @@ def _download_text(supabase, bucket: str, name: str) -> Optional[str]:
|
|
53 |
return None
|
54 |
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
|
57 |
"""Upload file only if content differs from latest existing file with the same prefix.
|
58 |
|
@@ -67,18 +119,39 @@ def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str,
|
|
67 |
payload = content.encode('utf-8')
|
68 |
|
69 |
prefix, pattern = _extract_prefix_and_match_pattern(filename)
|
|
|
|
|
|
|
70 |
names = _list_names(supabase, bucket)
|
|
|
|
|
71 |
latest = _pick_latest_name(names, pattern)
|
72 |
if latest:
|
|
|
73 |
old_text = _download_text(supabase, bucket, latest)
|
74 |
-
if old_text is not None
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
supabase.storage.from_(bucket).upload(
|
78 |
path=filename,
|
79 |
file=payload,
|
80 |
file_options={"content-type": "text/plain; charset=utf-8"}
|
81 |
)
|
|
|
82 |
return {"result": "uploaded"}
|
83 |
except Exception as e:
|
|
|
84 |
return {"result": "error", "error": str(e)}
|
|
|
19 |
|
20 |
|
21 |
def _extract_prefix_and_match_pattern(filename: str):
    """Derive the timestamp-free prefix of *filename* and a regex for its siblings.

    A trailing ``_YYYYMMDD_HHMM`` or ``_YYYYMMDD_HHMMSS`` suffix (immediately
    before the extension) is stripped to obtain the prefix. When no such
    suffix is present, the whole base name is used as the prefix.

    Args:
        filename: Object name, e.g. ``"jadwal_20240101_1230.txt"``.

    Returns:
        A ``(prefix, pattern)`` tuple. ``pattern`` is the source of an anchored
        regular expression matching
        ``<prefix>_<8 digits>_<4 or 6 digits><extension>``. The extension
        defaults to ``.txt`` when *filename* has none.
    """
    # Split on the last dot. A name without a usable base (no dot at all, or a
    # leading-dot name such as ".env") is treated as having no extension.
    base, _, ext = filename.rpartition('.')
    if not base:
        base = filename
        ext = ''

    # HHMM with an optional SS. A plain \d{4,6} would also accept a five-digit
    # time, which is neither of the two supported formats.
    time_part = r"\d{4}(?:\d{2})?"

    # The greedy (.*) makes this strip only the LAST timestamp-looking suffix.
    m = re.match(rf"^(.*)_(\d{{8}}_{time_part})$", base)
    # No recognizable timestamp: treat the entire base as the prefix.
    prefix = m.group(1) if m else base

    ext_pattern = re.escape('.' + ext) if ext else r"\.txt"
    pattern = rf"^{re.escape(prefix)}_\d{{8}}_{time_part}{ext_pattern}$"
    return prefix, pattern
|
44 |
|
45 |
|
|
|
49 |
return None
|
50 |
|
51 |
def ts_key(name: str):
    """Return a sortable ``YYYYMMDD_HHMMSS`` key from *name*'s timestamp suffix.

    The timestamp must sit directly before the final extension and have the
    form ``_YYYYMMDD_HHMM`` or ``_YYYYMMDD_HHMMSS``. A four-digit time is
    padded with ``"00"`` seconds so both forms compare correctly as strings.

    Args:
        name: File name to inspect.

    Returns:
        The normalized key, or ``"00000000_000000"`` when *name* carries no
        recognizable timestamp, so such names order before every real one.
    """
    # Exactly 4 or 6 time digits. \d{4,6} would let a five-digit time through
    # unnormalized, giving a 14-character key that mis-sorts against
    # 15-character ones.
    m = re.search(r"_(\d{8})_(\d{4}(?:\d{2})?)\.[^.]+$", name)
    if not m:
        return "00000000_000000"
    date_part, time_part = m.groups()
    # HHMM -> HHMM00; HHMMSS is already six characters wide.
    return f"{date_part}_{time_part.ljust(6, '0')}"
|
61 |
|
62 |
matched.sort(key=ts_key, reverse=True)
|
63 |
return matched[0]
|
|
|
73 |
return None
|
74 |
|
75 |
|
76 |
+
def _normalize_text(text: str) -> str:
    """Return *text* in a canonical form suitable for change detection.

    Steps, in order:

    - drop any leading byte-order mark (U+FEFF);
    - drop every line whose first non-blank text is 'Diperbarui pada:'
      ("Updated on:") or 'Tanggal Akses:' ("Access date:"), since these
      carry volatile timestamps;
    - strip trailing whitespace from each remaining line;
    - squeeze runs of blank lines down to a single blank line;
    - strip leading/trailing whitespace from the joined result.
    """
    volatile_markers = ("Diperbarui pada:", "Tanggal Akses:")

    # lstrip is a no-op when the text does not begin with a BOM.
    body = text.lstrip("\ufeff")

    kept = [
        raw.rstrip()
        for raw in body.splitlines()
        if not raw.lstrip().startswith(volatile_markers)
    ]

    squeezed = []
    for entry in kept:
        # Entries are already right-stripped, so a blank one is exactly "".
        if not entry and squeezed and not squeezed[-1]:
            continue
        squeezed.append(entry)

    return "\n".join(squeezed).strip()
|
106 |
+
|
107 |
+
|
108 |
def upload_if_changed(supabase, bucket: str, filename: str, content: Union[str, bytes]) -> Dict[str, str]:
|
109 |
"""Upload file only if content differs from latest existing file with the same prefix.
|
110 |
|
|
|
119 |
payload = content.encode('utf-8')
|
120 |
|
121 |
prefix, pattern = _extract_prefix_and_match_pattern(filename)
|
122 |
+
print(f"[DEDUP] Checking file: {filename}")
|
123 |
+
print(f"[DEDUP] Extracted prefix: '{prefix}', pattern: '{pattern}'")
|
124 |
+
|
125 |
names = _list_names(supabase, bucket)
|
126 |
+
print(f"[DEDUP] Found {len(names)} total files in bucket")
|
127 |
+
|
128 |
latest = _pick_latest_name(names, pattern)
|
129 |
if latest:
|
130 |
+
print(f"[DEDUP] Latest existing file with same prefix: {latest}")
|
131 |
old_text = _download_text(supabase, bucket, latest)
|
132 |
+
if old_text is not None:
|
133 |
+
old_normalized = _normalize_text(old_text)
|
134 |
+
new_normalized = _normalize_text(text)
|
135 |
+
print(f"[DEDUP] Old content length (normalized): {len(old_normalized)} chars")
|
136 |
+
print(f"[DEDUP] New content length (normalized): {len(new_normalized)} chars")
|
137 |
+
|
138 |
+
if old_normalized == new_normalized:
|
139 |
+
print(f"[DEDUP] ✅ Content identical - SKIPPING upload")
|
140 |
+
return {"result": "skipped"}
|
141 |
+
else:
|
142 |
+
print(f"[DEDUP] ❌ Content differs - PROCEEDING with upload")
|
143 |
+
else:
|
144 |
+
print(f"[DEDUP] ⚠️ Could not download existing file content - PROCEEDING with upload")
|
145 |
+
else:
|
146 |
+
print(f"[DEDUP] No existing file with same prefix found - PROCEEDING with upload")
|
147 |
|
148 |
supabase.storage.from_(bucket).upload(
|
149 |
path=filename,
|
150 |
file=payload,
|
151 |
file_options={"content-type": "text/plain; charset=utf-8"}
|
152 |
)
|
153 |
+
print(f"[DEDUP] ✅ Successfully uploaded: {filename}")
|
154 |
return {"result": "uploaded"}
|
155 |
except Exception as e:
|
156 |
+
print(f"[DEDUP] ❌ Error during upload: {str(e)}")
|
157 |
return {"result": "error", "error": str(e)}
|