Update app.py
app.py
CHANGED
@@ -1,11 +1,11 @@
-import io, math, json, gzip
+import io, math, json, gzip
 import numpy as np
 import pandas as pd
 import gradio as gr
 
-
-
-#
+# -------------------------------
+# Core metric helpers
+# -------------------------------
 def shannon_entropy_from_counts(counts: np.ndarray) -> float:
     counts = counts.astype(float)
     total = counts.sum()
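The hunk cuts off inside `shannon_entropy_from_counts`; the rest of that helper is unchanged and not shown by this commit. For readers following along, a standard completion would look like the sketch below (the body after `total` is an assumption, not the file's actual code):

```python
import numpy as np

def shannon_entropy_from_counts(counts: np.ndarray) -> float:
    # assumed completion: Shannon entropy H = -sum(p * log2 p), in bits
    counts = counts.astype(float)
    total = counts.sum()
    if total <= 0:
        return 0.0
    p = counts[counts > 0] / total  # drop zero counts; 0 * log 0 := 0
    return float(-(p * np.log2(p)).sum())
```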
@@ -94,7 +94,6 @@ def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
         return 0.0
     if n <= max_leaf:
         return 0.0
-    d = points.shape[1]
     vals = points[:, axis]
     med = np.median(vals)
     left = points[vals <= med]
@@ -105,7 +104,7 @@ def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
     for p in (pL, pR):
         if p > 0:
             H_here += -p * math.log(p, 2)
-    next_axis = (axis + 1) %
+    next_axis = (axis + 1) % points.shape[1]
    return H_here + kd_entropy(left, max_leaf, next_axis) + kd_entropy(right, max_leaf, next_axis)
 
 def normalize(value: float, max_value: float) -> float:
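This is the substantive bug fix of the commit: the old `next_axis = (axis + 1) %` line was truncated and would not even parse, and taking the increment modulo `points.shape[1]` is what cycles the median split through every dimension. A minimal self-contained sketch of the repaired recursion, reassembled from the fragments visible in these hunks (lines the diff does not show are assumptions):

```python
import math
import numpy as np

def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
    """Entropy (bits) of a recursive median-split partition; sketch of the fixed code."""
    n = len(points)
    if n == 0 or n <= max_leaf:
        return 0.0
    vals = points[:, axis]
    med = np.median(vals)
    left = points[vals <= med]
    right = points[vals > med]
    if len(right) == 0:  # safety guard (not in the diff): constant slice, stop recursing
        return 0.0
    pL, pR = len(left) / n, len(right) / n
    H_here = 0.0
    for p in (pL, pR):
        if p > 0:
            H_here += -p * math.log(p, 2)
    next_axis = (axis + 1) % points.shape[1]  # the fix: wrap the split axis over all dims
    return H_here + kd_entropy(left, max_leaf, next_axis) + kd_entropy(right, max_leaf, next_axis)

rng = np.random.default_rng(0)
print(kd_entropy(rng.normal(size=(1000, 2))))  # deeper, more balanced splits -> more bits
```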
@@ -114,7 +113,115 @@ def normalize(value: float, max_value: float) -> float:
     v = max(0.0, min(1.0, value / max_value))
     return float(v)
 
-def compute_metrics(df: pd.DataFrame):
+# -------------------------------
+# Scoring + interpretations
+# -------------------------------
+def grade_band(value: float, thresholds: list, labels: list):
+    """Generic banding helper: thresholds ascending; returns (label_idx, label)."""
+    for i, t in enumerate(thresholds):
+        if value <= t:
+            return i, labels[i]
+    return len(labels) - 1, labels[-1]
+
+def interpret_report(report: dict) -> dict:
+    """Produce human-friendly interpretations with color badges and advice."""
+    r, c = report["shape"]["rows"], report["shape"]["cols"]
+    max_bits = math.log2(max(2, r))
+
+    # Harvestable Energy (0..1)
+    he = report.get("harvestable_energy_score", 0.0)
+    he_pct = round(100 * he)
+    he_idx, he_label = grade_band(1.0 - he, [0.15, 0.35, 0.6, 0.85],  # invert so higher is better
+                                  ["Excellent", "High", "Moderate", "Low", "Very Low"])
+    he_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][he_idx]
+
+    # Gzip ratio (lower is better)
+    gz = report.get("gzip_compression_ratio", 1.0)
+    gz_idx, gz_label = grade_band(gz, [0.45, 0.7, 0.9, 1.1], ["Highly compressible", "Compressible", "Some structure", "Low structure", "Unstructured"])
+    gz_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][gz_idx]
+
+    # kd-entropy (lower is better). Normalize by log2(n)
+    Hkd = float(report.get("kd_partition_entropy_bits", 0.0))
+    Hkd_norm = normalize(Hkd, max_bits)
+    kd_idx, kd_label = grade_band(Hkd_norm, [0.15, 0.3, 0.5, 0.75], ["Simple spatial blocks", "Moderately simple", "Mixed", "Complex", "Highly complex"])
+    kd_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][kd_idx]
+
+    # Run-entropy / sortedness aggregation for numeric columns
+    per_col = report.get("per_column", {})
+    run_H = []
+    sorted_fracs = []
+    for col, st in per_col.items():
+        if "run_entropy_bits" in st:
+            run_H.append(st["run_entropy_bits"])
+            sorted_fracs.append(st.get("sortedness_fraction", 0.0))
+    if run_H:
+        runH_mean = float(np.mean(run_H))
+        runH_norm = normalize(runH_mean, max_bits)
+        sort_mean = float(np.mean(sorted_fracs)) if sorted_fracs else 0.0
+    else:
+        runH_norm = 1.0
+        sort_mean = 0.0
+
+    run_idx, run_label = grade_band(runH_norm, [0.15, 0.3, 0.5, 0.75], ["Long smooth runs", "Mostly smooth", "Mixed runs", "Choppy", "Highly choppy"])
+    run_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][run_idx]
+
+    sort_idx, sort_label = grade_band(1.0 - sort_mean, [0.15, 0.3, 0.5, 0.75], ["Highly sorted", "Mostly sorted", "Partially sorted", "Barely sorted", "Unsorted"])
+    sort_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][sort_idx]
+
+    # Duplicate rows
+    dup = report.get("duplicate_row_fraction", 0.0)
+    dup_idx, dup_label = grade_band(dup, [0.01, 0.05, 0.15, 0.3], ["Clean", "Light dups", "Moderate dups", "High dups", "Very high dups"])
+    dup_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][dup_idx]
+
+    # Recommendations (simple rule-based)
+    recs = []
+    if he >= 0.7:
+        recs.append("Leverage **adaptive algorithms** (TimSort-style merges, linear hull/skyline passes) for near-linear performance.")
+    elif he >= 0.4:
+        recs.append("Consider **light preprocessing** (bucketing, dedupe) to unlock more adaptive speedups.")
+    else:
+        recs.append("Expect **near worst-case costs**; use robust algorithms and consider feature engineering/cleaning.")
+
+    if gz <= 0.7:
+        recs.append("Data is **highly compressible** – try dictionary/columnar encoding and caching to cut memory/IO.")
+    elif gz >= 1.0:
+        recs.append("Data is **hard to compress** – prioritize dimensionality reduction or noise filtering.")
+
+    if runH_norm <= 0.3 or sort_mean >= 0.7:
+        recs.append("Columns show **long monotone runs** – merges and single-pass scans will be efficient.")
+    else:
+        recs.append("Columns are **choppy** – batch/aggregate before sorting to reduce comparisons.")
+
+    if Hkd_norm <= 0.3:
+        recs.append("Spatial structure is **simple** – kd/quad trees will be shallow; range queries will be fast.")
+    elif Hkd_norm >= 0.6:
+        recs.append("Spatial structure is **complex** – consider clustering/tiling before building indexes.")
+
+    if dup >= 0.05:
+        recs.append("De-duplicate rows to lower entropy and improve compression & joins.")
+
+    # Summary verdict
+    verdict = ["Outstanding structure for fast algorithms.",
+               "Strong latent order; plenty of speed to harvest.",
+               "Mixed: some order present; moderate gains possible.",
+               "Low order; focus on cleaning and feature engineering.",
+               "Chaotic: assume worst-case runtimes."][he_idx]
+
+    return {
+        "he": {"pct": he_pct, "label": he_label, "color": he_color},
+        "gzip": {"value": gz, "label": gz_label, "color": gz_color},
+        "kd": {"value": Hkd, "label": kd_label, "color": kd_color},
+        "runs": {"value": runH_norm, "label": run_label, "color": run_color},
+        "sorted": {"value": sort_mean, "label": sort_label, "color": sort_color},
+        "dup": {"value": dup, "label": dup_label, "color": dup_color},
+        "verdict": verdict,
+        "recs": recs[:6]
+    }
+
+# -------------------------------
+# Compute metrics
+# -------------------------------
+def compute_metrics(df: pd.DataFrame) -> dict:
     report = {}
     n_rows, n_cols = df.shape
     report["shape"] = {"rows": int(n_rows), "cols": int(n_cols)}
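The new `grade_band` helper drives every badge in the dashboard, so its edge behavior is worth pinning down: thresholds are inclusive upper bounds scanned in ascending order, and any value beyond the last threshold falls into the final label. A quick check with hypothetical values, given `grade_band` as defined above:

```python
labels = ["Excellent", "High", "Moderate", "Low", "Very Low"]
print(grade_band(0.10, [0.15, 0.35, 0.6, 0.85], labels))  # (0, 'Excellent')
print(grade_band(0.20, [0.15, 0.35, 0.6, 0.85], labels))  # (1, 'High')
print(grade_band(0.99, [0.15, 0.35, 0.6, 0.85], labels))  # (4, 'Very Low')
```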
@@ -180,6 +287,7 @@ def compute_metrics(df: pd.DataFrame):
     report["pareto_maxima_2d"] = 0
     report["kd_partition_entropy_bits"] = 0.0
 
+    # Harvestable Energy
     max_bits = math.log2(max(2, n_rows))
     he_parts = []
     he_parts.append(1.0 - max(0.0, min(1.0, report["gzip_compression_ratio"])))
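The first Harvestable Energy component inverts the clamped gzip ratio, on the premise that redundancy a compressor can find is order an algorithm can exploit. The ratio itself is computed elsewhere in the file; a typical implementation (this helper body is an assumption, not part of the diff) would be:

```python
import gzip

def gzip_compression_ratio(data: bytes) -> float:
    """Compressed size over raw size; lower means more exploitable structure."""
    if not data:
        return 1.0
    return len(gzip.compress(data)) / len(data)

print(gzip_compression_ratio(b"abc" * 1000))  # highly repetitive input -> small ratio
```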
@@ -202,58 +310,101 @@ def compute_metrics(df: pd.DataFrame):
 
     return report
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# -------------------------------
+# UI rendering helpers
+# -------------------------------
+def badge(text: str, color: str) -> str:
+    return f"<span style='background:{color};color:white;padding:6px 10px;border-radius:999px;font-weight:600'>{text}</span>"
+
+def metric_card(title: str, value: str, badge_html: str) -> str:
+    return f"""
+    <div style="flex:1;min-width:220px;border:1px solid #e5e7eb;border-radius:14px;padding:14px 16px;">
+      <div style="font-size:14px;color:#6b7280;margin-bottom:8px">{title}</div>
+      <div style="font-size:22px;font-weight:700;margin-bottom:10px">{value}</div>
+      {badge_html}
+    </div>
+    """
+
+def render_dashboard(report: dict, interp: dict) -> str:
+    he = interp["he"]
+    gz = interp["gzip"]
+    kd = interp["kd"]
+    runs = interp["runs"]
+    sortb = interp["sorted"]
+    dup = interp["dup"]
+
+    cards = []
+    cards.append(metric_card("Harvestable Energy", f"{he['pct']} / 100", badge(he['label'], he['color'])))
+    cards.append(metric_card("Compressibility (gzip)", f"{gz['value']:.3f}", badge(gz['label'], gz['color'])))
+    cards.append(metric_card("Range-Partition Entropy (kd bits)", f"{kd['value']:.3f}", badge(kd['label'], kd['color'])))
+    cards.append(metric_card("Run-Entropy (avg, normalized)", f"{runs['value']:.2f}", badge(runs['label'], runs['color'])))
+    cards.append(metric_card("Sortedness (avg fraction)", f"{sortb['value']:.2f}", badge(sortb['label'], sortb['color'])))
+    cards.append(metric_card("Duplicate Rows (fraction)", f"{dup['value']:.2f}", badge(dup['label'], dup['color'])))
+
+    grid = "<div style='display:flex;flex-wrap:wrap;gap:12px'>" + "".join(cards) + "</div>"
+    verdict = f"<div style='margin-top:12px;padding:14px 16px;background:#f9fafb;border:1px solid #e5e7eb;border-radius:14px'><b>Verdict:</b> {interp['verdict']}</div>"
+    return grid + verdict
+
+def render_recs(interp: dict) -> str:
+    lis = "".join([f"<li>{r}</li>" for r in interp["recs"]])
+    return f"<ul>{lis}</ul>"
+
+def render_columns(report: dict) -> str:
+    rows = []
     for c, st in report.get("per_column", {}).items():
-
+        miss = report["missing_fraction_per_column"].get(c, 0.0)
         if "entropy_binned_bits" in st:
-
-            f"{st['monotone_runs']} runs (run-entropy {st['run_entropy_bits']:.2f} bits), "
-            f"sortedness {st['sortedness_fraction']:.2f}.")
+            rows.append(f"<tr><td><b>{c}</b> (num)</td><td>{miss:.1%}</td><td>{st['entropy_binned_bits']:.2f}</td><td>{st['monotone_runs']}</td><td>{st['run_entropy_bits']:.2f}</td><td>{st['sortedness_fraction']:.2f}</td></tr>")
         elif "entropy_bits" in st:
-
-            f"{st['unique_values']} unique.")
+            rows.append(f"<tr><td><b>{c}</b> (cat)</td><td>{miss:.1%}</td><td>{st['entropy_bits']:.2f}</td><td>-</td><td>-</td><td>-</td></tr>")
         else:
-
-
-
+            rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
+    header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
+    table = "<table style='width:100%;border-collapse:collapse'>" + header + "".join(rows) + "</table>"
+    # simple row borders
+    table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
+    table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
+    table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
+    return table
 
+# -------------------------------
+# Gradio app
+# -------------------------------
 def analyze(file):
     if file is None:
-        return "Please upload a CSV.", ""
+        return "{}", "Please upload a CSV.", "", ""
     try:
         df = pd.read_csv(file.name)
     except Exception as e:
-        return f"Failed to read CSV: {e}", ""
+        return "{}", f"Failed to read CSV: {e}", "", ""
+
     report = compute_metrics(df)
-
-    return json.dumps(report, indent=2), md
+    interp = interpret_report(report)
 
-
-
-
-
-
-
+    report_json = json.dumps(report, indent=2)
+    dashboard_html = render_dashboard(report, interp)
+    recs_html = render_recs(interp)
+    cols_html = render_columns(report)
+
+    return report_json, dashboard_html, recs_html, cols_html
+
+with gr.Blocks(title="OrderLens – Data Interpreter") as demo:
+    gr.Markdown("# OrderLens – Data Interpreter")
+    gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
     with gr.Row():
-
-
-
+        inp = gr.File(file_types=[".csv"], label="CSV file")
+        btn = gr.Button("Analyze", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("### Dashboard")          # color-coded cards + verdict
+    dash = gr.HTML()
+    gr.Markdown("### Recommendations")    # actionable tips
+    recs = gr.HTML()
+    gr.Markdown("### Column Details")     # per-column table
+    cols = gr.HTML()
+    gr.Markdown("### Raw report (JSON)")  # API-friendly
+    json_out = gr.Code(label="Report", language="json")
+
+    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols])
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     demo.launch()
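With the reworked four-output `analyze` wired to `[json_out, dash, recs, cols]`, the return order and the output list must stay in sync. A hypothetical smoke test outside Gradio (`tiny.csv` and the `SimpleNamespace` stand-in for the upload object are illustrative only, and it assumes the rest of app.py is loaded):

```python
import types
import pandas as pd

# build a tiny CSV and mimic the .name attribute Gradio's file object exposes
pd.DataFrame({"x": [3, 1, 2], "y": ["a", "b", "a"]}).to_csv("tiny.csv", index=False)
fake_upload = types.SimpleNamespace(name="tiny.csv")

report_json, dash_html, recs_html, cols_html = analyze(fake_upload)
print(dash_html[:80])  # metric cards arrive as inline-styled HTML
```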