TuringsSolutions committed
Commit ad69efd · verified · 1 Parent(s): afd2e5c

Update app.py

Files changed (1):
  1. app.py +198 -47
app.py CHANGED
@@ -1,11 +1,11 @@
-import io, math, json, gzip, textwrap
 import numpy as np
 import pandas as pd
 import gradio as gr

-from typing import Dict, Any
-
-# --- (Functions below are minimal clones to keep the Gradio app standalone) ---
 def shannon_entropy_from_counts(counts: np.ndarray) -> float:
     counts = counts.astype(float)
     total = counts.sum()
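
The helper above is cut off at the hunk boundary. For readers skimming the diff, this is a minimal self-contained sketch of the standard counts-to-entropy computation; the tail past `total = counts.sum()` is not shown in this commit, so it is an assumption here:

    import numpy as np

    def shannon_entropy_sketch(counts: np.ndarray) -> float:
        # H = -sum(p * log2 p) over the nonzero bins, in bits
        counts = counts.astype(float)
        total = counts.sum()
        if total <= 0:  # assumed guard; the diff truncates before this point
            return 0.0
        p = counts[counts > 0] / total
        return float(-(p * np.log2(p)).sum())

    print(shannon_entropy_sketch(np.array([50, 50])))  # 1.0 bit: a fair coin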
@@ -94,7 +94,6 @@ def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
         return 0.0
     if n <= max_leaf:
         return 0.0
-    d = points.shape[1]
     vals = points[:, axis]
     med = np.median(vals)
     left = points[vals <= med]
@@ -105,7 +104,7 @@ def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
     for p in (pL, pR):
         if p > 0:
             H_here += -p * math.log(p, 2)
-    next_axis = (axis + 1) % d
     return H_here + kd_entropy(left, max_leaf, next_axis) + kd_entropy(right, max_leaf, next_axis)

 def normalize(value: float, max_value: float) -> float:
@@ -114,7 +113,115 @@ def normalize(value: float, max_value: float) -> float:
     v = max(0.0, min(1.0, value / max_value))
     return float(v)

-def compute_metrics(df: pd.DataFrame):
     report = {}
     n_rows, n_cols = df.shape
     report["shape"] = {"rows": int(n_rows), "cols": int(n_cols)}
@@ -180,6 +287,7 @@ def compute_metrics(df: pd.DataFrame):
     report["pareto_maxima_2d"] = 0
     report["kd_partition_entropy_bits"] = 0.0

     max_bits = math.log2(max(2, n_rows))
     he_parts = []
     he_parts.append(1.0 - max(0.0, min(1.0, report["gzip_compression_ratio"])))
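
Only the gzip component of `he_parts` is visible in this hunk; the remaining parts fall outside the diff. The visible pattern clamps each component to [0, 1] and orients it so that 1 means more exploitable order, e.g. (toy numbers, assumed shape of the score):

    gzip_ratio = 0.4                              # hypothetical, highly compressible data
    part = 1.0 - max(0.0, min(1.0, gzip_ratio))   # contributes 0.6 toward the energy score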
@@ -202,58 +310,101 @@

     return report

-def explain_report(report: Dict[str, Any]) -> str:
-    lines = []
-    r, c = report["shape"]["rows"], report["shape"]["cols"]
-    lines.append(f"**Dataset shape:** {r} rows × {c} columns.")
-    g = report.get("gzip_compression_ratio", None)
-    if g is not None:
-        lines.append(f"**Global compressibility (gzip ratio):** {g:.3f}. Lower = more structure.")
-    he = report.get("harvestable_energy_score", 0.0)
-    he_pct = int(100 * he)
-    lines.append(f"**Harvestable Energy (0–100):** ~{he_pct}. Higher = more exploitable order.")
-    pm = report.get("pareto_maxima_2d", None)
-    if pm is not None:
-        lines.append(f"**2D Pareto maxima (first two numeric cols):** {pm}.")
-    Hkd = report.get("kd_partition_entropy_bits", None)
-    if Hkd is not None:
-        lines.append(f"**Range-partition entropy (kd approx):** {Hkd:.3f} bits.")
-    lines.append("\n**Column-level:**")
     for c, st in report.get("per_column", {}).items():
-        m = report["missing_fraction_per_column"].get(c, 0.0)
         if "entropy_binned_bits" in st:
-            lines.append(f"- **{c}** (numeric): missing {m:.1%}, binned entropy {st['entropy_binned_bits']:.2f} bits, "
-                         f"{st['monotone_runs']} runs (run-entropy {st['run_entropy_bits']:.2f} bits), "
-                         f"sortedness {st['sortedness_fraction']:.2f}.")
         elif "entropy_bits" in st:
-            lines.append(f"- **{c}** (categorical): missing {m:.1%}, entropy {st['entropy_bits']:.2f} bits, "
-                         f"{st['unique_values']} unique.")
         else:
-            lines.append(f"- **{c}**: missing {m:.1%}.")
-    lines.append("\n**Tips:** Higher energy and lower entropies often allow near-linear algorithms (run-aware sorts, hull scans, envelope merges).")
-    return "\n".join(lines)

 def analyze(file):
     if file is None:
-        return "Please upload a CSV.", ""
     try:
         df = pd.read_csv(file.name)
     except Exception as e:
-        return f"Failed to read CSV: {e}", ""
     report = compute_metrics(df)
-    md = explain_report(report)
-    return json.dumps(report, indent=2), md

-with gr.Blocks(title="Dataset Energy & Entropy Analyzer") as demo:
-    gr.Markdown("# Dataset Energy & Entropy Analyzer\nUpload a CSV to compute dataset structure metrics (entropy, runs, compressibility, kd-entropy) and an overall **Harvestable Energy** score.")
-    with gr.Row():
-        inp = gr.File(file_types=[".csv"], label="CSV file")
-    with gr.Row():
-        btn = gr.Button("Analyze", variant="primary")
     with gr.Row():
-        json_out = gr.Code(label="Raw report (JSON)", language="json")
-        md_out = gr.Markdown()
-    btn.click(analyze, inputs=inp, outputs=[json_out, md_out])

-if __name__ == "__main__":
     demo.launch()

+import io, math, json, gzip
 import numpy as np
 import pandas as pd
 import gradio as gr

+# -------------------------------
+# Core metric helpers
+# -------------------------------
 def shannon_entropy_from_counts(counts: np.ndarray) -> float:
     counts = counts.astype(float)
     total = counts.sum()
 
         return 0.0
     if n <= max_leaf:
         return 0.0
     vals = points[:, axis]
     med = np.median(vals)
     left = points[vals <= med]
 
     for p in (pL, pR):
         if p > 0:
             H_here += -p * math.log(p, 2)
+    next_axis = (axis + 1) % points.shape[1]
     return H_here + kd_entropy(left, max_leaf, next_axis) + kd_entropy(right, max_leaf, next_axis)
 
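One behavioral note on this hunk: the removed code cached `d = points.shape[1]` at the top of `kd_entropy`, and the new code simply inlines it when cycling the split axis, so the recursion is unchanged. A quick smoke test (hypothetical, not part of the commit):

    import numpy as np

    pts = np.random.default_rng(0).random((1000, 2))
    # total median-split entropy in bits, summed over the partition tree;
    # the split axis cycles 0 -> 1 -> 0 -> ... via (axis + 1) % points.shape[1]
    print(kd_entropy(pts))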
 def normalize(value: float, max_value: float) -> float:
 
     v = max(0.0, min(1.0, value / max_value))
     return float(v)

+# -------------------------------
+# Scoring + interpretations
+# -------------------------------
+def grade_band(value: float, thresholds: list, labels: list):
+    """Generic banding helper: thresholds ascending; returns (label_idx, label)."""
+    for i, t in enumerate(thresholds):
+        if value <= t:
+            return i, labels[i]
+    return len(labels)-1, labels[-1]
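
`grade_band` returns the first band whose threshold the value does not exceed, falling back to the last label. For instance, with the gzip thresholds used below in `interpret_report` (a usage sketch, not part of the commit):

    idx, label = grade_band(0.5, [0.45, 0.7, 0.9, 1.1],
                            ["Highly compressible", "Compressible", "Some structure",
                             "Low structure", "Unstructured"])
    print(idx, label)  # 1 Compressible (0.7 is the first threshold with 0.5 <= t)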
+
+def interpret_report(report: dict) -> dict:
+    """Produce human-friendly interpretations with color badges and advice."""
+    r, c = report["shape"]["rows"], report["shape"]["cols"]
+    max_bits = math.log2(max(2, r))
+
+    # Harvestable Energy (0..1)
+    he = report.get("harvestable_energy_score", 0.0)
+    he_pct = round(100 * he)
+    he_idx, he_label = grade_band(1.0 - he, [0.15, 0.35, 0.6, 0.85],  # invert so higher is better
+                                  ["Excellent", "High", "Moderate", "Low", "Very Low"])
+    he_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][he_idx]
+
+    # Gzip ratio (lower is better)
+    gz = report.get("gzip_compression_ratio", 1.0)
+    gz_idx, gz_label = grade_band(gz, [0.45, 0.7, 0.9, 1.1], ["Highly compressible", "Compressible", "Some structure", "Low structure", "Unstructured"])
+    gz_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][gz_idx]
+
+    # kd-entropy (lower is better). Normalize by log2(n)
+    Hkd = float(report.get("kd_partition_entropy_bits", 0.0))
+    Hkd_norm = normalize(Hkd, max_bits)
+    kd_idx, kd_label = grade_band(Hkd_norm, [0.15, 0.3, 0.5, 0.75], ["Simple spatial blocks", "Moderately simple", "Mixed", "Complex", "Highly complex"])
+    kd_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][kd_idx]
+
+    # Run-entropy / Sortedness aggregation for numeric columns
+    per_col = report.get("per_column", {})
+    run_H = []
+    sorted_fracs = []
+    for col, st in per_col.items():
+        if "run_entropy_bits" in st:
+            run_H.append(st["run_entropy_bits"])
+            sorted_fracs.append(st.get("sortedness_fraction", 0.0))
+    if run_H:
+        runH_mean = float(np.mean(run_H))
+        runH_norm = normalize(runH_mean, max_bits)
+        sort_mean = float(np.mean(sorted_fracs)) if sorted_fracs else 0.0
+    else:
+        runH_norm = 1.0
+        sort_mean = 0.0
+
+    run_idx, run_label = grade_band(runH_norm, [0.15, 0.3, 0.5, 0.75], ["Long smooth runs", "Mostly smooth", "Mixed runs", "Choppy", "Highly choppy"])
+    run_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][run_idx]
+
+    sort_idx, sort_label = grade_band(1.0 - sort_mean, [0.15, 0.3, 0.5, 0.75], ["Highly sorted", "Mostly sorted", "Partially sorted", "Barely sorted", "Unsorted"])
+    sort_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][sort_idx]
+
+    # Duplicate rows
+    dup = report.get("duplicate_row_fraction", 0.0)
+    dup_idx, dup_label = grade_band(dup, [0.01, 0.05, 0.15, 0.3], ["Clean", "Light dups", "Moderate dups", "High dups", "Very high dups"])
+    dup_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][dup_idx]
+
+    # Recommendations (simple rule-based)
+    recs = []
+    if he >= 0.7:
+        recs.append("Leverage **adaptive algorithms** (TimSort-style merges, linear hull/skyline passes) for near-linear performance.")
+    elif he >= 0.4:
+        recs.append("Consider **light preprocessing** (bucketing, dedupe) to unlock more adaptive speedups.")
+    else:
+        recs.append("Expect **near worst-case costs**; use robust algorithms and consider feature engineering/cleaning.")
+
+    if gz <= 0.7:
+        recs.append("Data is **highly compressible** → try dictionary/columnar encoding and caching to cut memory/IO.")
+    elif gz >= 1.0:
+        recs.append("Data is **hard to compress** → prioritize dimensionality reduction or noise filtering.")
+
+    if runH_norm <= 0.3 or sort_mean >= 0.7:
+        recs.append("Columns show **long monotone runs** → merges and single-pass scans will be efficient.")
+    else:
+        recs.append("Columns are **choppy** → batch/aggregate before sorting to reduce comparisons.")
+
+    if Hkd_norm <= 0.3:
+        recs.append("Spatial structure is **simple** → kd/quad trees will be shallow; range queries will be fast.")
+    elif Hkd_norm >= 0.6:
+        recs.append("Spatial structure is **complex** → consider clustering/tiling before building indexes.")
+
+    if dup >= 0.05:
+        recs.append("De-duplicate rows to lower entropy and improve compression & joins.")
+
+    # Summary verdict
+    verdict = ["Outstanding structure for fast algorithms.",
+               "Strong latent order; plenty of speed to harvest.",
+               "Mixed: some order present; moderate gains possible.",
+               "Low order; focus on cleaning and feature engineering.",
+               "Chaotic: assume worst-case runtimes."][he_idx]
+
+    return {
+        "he": {"pct": he_pct, "label": he_label, "color": he_color},
+        "gzip": {"value": gz, "label": gz_label, "color": gz_color},
+        "kd": {"value": Hkd, "label": kd_label, "color": kd_color},
+        "runs": {"value": runH_norm, "label": run_label, "color": run_color},
+        "sorted": {"value": sort_mean, "label": sort_label, "color": sort_color},
+        "dup": {"value": dup, "label": dup_label, "color": dup_color},
+        "verdict": verdict,
+        "recs": recs[:6]
+    }
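
`interpret_report` reads only the keys shown above and falls back to `.get()` defaults for anything missing, so a minimal report is enough to exercise it; a hedged sketch (real reports come from `compute_metrics`):

    minimal = {"shape": {"rows": 100, "cols": 3}}  # every other key uses its default
    out = interpret_report(minimal)
    print(out["verdict"])  # "Chaotic: assume worst-case runtimes." (defaults read as no structure)
    print(out["he"])       # {'pct': 0, 'label': 'Very Low', 'color': '#ef4444'}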
+
+# -------------------------------
+# Compute metrics
+# -------------------------------
+def compute_metrics(df: pd.DataFrame) -> dict:
     report = {}
     n_rows, n_cols = df.shape
     report["shape"] = {"rows": int(n_rows), "cols": int(n_cols)}
 
     report["pareto_maxima_2d"] = 0
     report["kd_partition_entropy_bits"] = 0.0

+    # Harvestable Energy
     max_bits = math.log2(max(2, n_rows))
     he_parts = []
     he_parts.append(1.0 - max(0.0, min(1.0, report["gzip_compression_ratio"])))
 

     return report

+# -------------------------------
+# UI rendering helpers
+# -------------------------------
+def badge(text: str, color: str) -> str:
+    return f"<span style='background:{color};color:white;padding:6px 10px;border-radius:999px;font-weight:600'>{text}</span>"
+
+def metric_card(title: str, value: str, badge_html: str) -> str:
+    return f"""
+    <div style="flex:1;min-width:220px;border:1px solid #e5e7eb;border-radius:14px;padding:14px 16px;">
+      <div style="font-size:14px;color:#6b7280;margin-bottom:8px">{title}</div>
+      <div style="font-size:22px;font-weight:700;margin-bottom:10px">{value}</div>
+      {badge_html}
+    </div>
+    """
+
+def render_dashboard(report: dict, interp: dict) -> str:
+    he = interp["he"]
+    gz = interp["gzip"]
+    kd = interp["kd"]
+    runs = interp["runs"]
+    sortb = interp["sorted"]
+    dup = interp["dup"]
+
+    cards = []
+    cards.append(metric_card("Harvestable Energy", f"{he['pct']} / 100", badge(he['label'], he['color'])))
+    cards.append(metric_card("Compressibility (gzip)", f"{gz['value']:.3f}", badge(gz['label'], gz['color'])))
+    cards.append(metric_card("Range-Partition Entropy (kd bits)", f"{kd['value']:.3f}", badge(kd['label'], kd['color'])))
+    cards.append(metric_card("Run-Entropy (avg, normalized)", f"{runs['value']:.2f}", badge(runs['label'], runs['color'])))
+    cards.append(metric_card("Sortedness (avg fraction)", f"{sortb['value']:.2f}", badge(sortb['label'], sortb['color'])))
+    cards.append(metric_card("Duplicate Rows (fraction)", f"{dup['value']:.2f}", badge(dup['label'], dup['color'])))
+
+    grid = "<div style='display:flex;flex-wrap:wrap;gap:12px'>" + "".join(cards) + "</div>"
+    verdict = f"<div style='margin-top:12px;padding:14px 16px;background:#f9fafb;border:1px solid #e5e7eb;border-radius:14px'><b>Verdict:</b> {interp['verdict']}</div>"
+    return grid + verdict
+
+def render_recs(interp: dict) -> str:
+    lis = "".join([f"<li>{r}</li>" for r in interp["recs"]])
+    return f"<ul>{lis}</ul>"
+
+def render_columns(report: dict) -> str:
+    rows = []
     for c, st in report.get("per_column", {}).items():
+        miss = report["missing_fraction_per_column"].get(c, 0.0)
         if "entropy_binned_bits" in st:
+            rows.append(f"<tr><td><b>{c}</b> (num)</td><td>{miss:.1%}</td><td>{st['entropy_binned_bits']:.2f}</td><td>{st['monotone_runs']}</td><td>{st['run_entropy_bits']:.2f}</td><td>{st['sortedness_fraction']:.2f}</td></tr>")
         elif "entropy_bits" in st:
+            rows.append(f"<tr><td><b>{c}</b> (cat)</td><td>{miss:.1%}</td><td>{st['entropy_bits']:.2f}</td><td>-</td><td>-</td><td>-</td></tr>")
         else:
+            rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
+    header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
+    table = "<table style='width:100%;border-collapse:collapse'>"+header+"".join(rows)+"</table>"
+    # simple row borders
+    table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
+    table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
+    table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
+    return table

+# -------------------------------
+# Gradio app
+# -------------------------------
 def analyze(file):
     if file is None:
+        return "{}", "Please upload a CSV.", "", ""
     try:
         df = pd.read_csv(file.name)
     except Exception as e:
+        return "{}", f"Failed to read CSV: {e}", "", ""
+
     report = compute_metrics(df)
+    interp = interpret_report(report)

+    report_json = json.dumps(report, indent=2)
+    dashboard_html = render_dashboard(report, interp)
+    recs_html = render_recs(interp)
+    cols_html = render_columns(report)
+
+    return report_json, dashboard_html, recs_html, cols_html
+
+with gr.Blocks(title="OrderLens — Data Interpreter") as demo:
+    gr.Markdown("# OrderLens — Data Interpreter")
+    gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
     with gr.Row():
+        inp = gr.File(file_types=[".csv"], label="CSV file")
+        btn = gr.Button("Analyze", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("### Dashboard")          # color-coded cards + verdict
+    dash = gr.HTML()
+    gr.Markdown("### Recommendations")    # actionable tips
+    recs = gr.HTML()
+    gr.Markdown("### Column Details")     # per-column table
+    cols = gr.HTML()
+    gr.Markdown("### Raw report (JSON)")  # API-friendly
+    json_out = gr.Code(label="Report", language="json")
+
+    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols])

+if __name__ == "__main__":
     demo.launch()
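
For a quick local check without opening the UI, the pipeline can be exercised end to end; a hypothetical smoke test, assuming Gradio's File component hands `analyze` an object exposing a `.name` path (which the code above relies on):

    import pandas as pd

    class FakeUpload:  # hypothetical stand-in for the Gradio file handle
        name = "example.csv"

    pd.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}).to_csv("example.csv", index=False)
    report_json, dash_html, recs_html, cols_html = analyze(FakeUpload())
    print(report_json)  # raw metrics JSON; the three HTML strings feed the dashboard sections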