dejanseo committed
Commit be9bf79 (verified) · 1 Parent(s): 0695df3

Create app.py

Files changed (1): app.py (+246 −0)
app.py ADDED
@@ -0,0 +1,246 @@
import streamlit as st
import pandas as pd
import numpy as np
import tempfile
import os
from dejan.veczip import veczip
import csv
import ast


def is_numeric(s):
    """Checks if a given string is numeric."""
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        return False


def parse_as_array(val):
    """Parses a string as an array of numbers."""
    if isinstance(val, (int, float)):
        return [val]
    val_str = str(val).strip()
    if val_str.startswith("[") and val_str.endswith("]"):
        try:
            arr = ast.literal_eval(val_str)
            if isinstance(arr, list) and all(is_numeric(str(x)) for x in arr):
                return arr
            return None
        except (ValueError, SyntaxError):
            return None
    parts = val_str.split(",")
    if len(parts) > 1 and all(is_numeric(p.strip()) for p in parts):
        return [float(p.strip()) for p in parts]
    return None


def get_line_pattern(row):
    """Detects the pattern (text, number, or array) of a row."""
    pattern = []
    for val in row:
        arr = parse_as_array(val)
        if arr is not None:
            pattern.append('arr')
        elif is_numeric(val):
            pattern.append('num')
        else:
            pattern.append('text')
    return pattern


def detect_header(lines):
    """Detects whether a CSV has a header row."""
    if len(lines) < 2:
        return False
    first_line_pattern = get_line_pattern(lines[0])
    subsequent_patterns = [get_line_pattern(r) for r in lines[1:]]
    if len(subsequent_patterns) > 1:
        if all(p == subsequent_patterns[0] for p in subsequent_patterns) and first_line_pattern != subsequent_patterns[0]:
            return True
    else:
        if subsequent_patterns and first_line_pattern != subsequent_patterns[0]:
            return True
    return False


def looks_like_id_column(col_values):
    """Checks if a column looks like an ID column (sequential integers)."""
    try:
        nums = [int(float(v)) for v in col_values]
        return nums == list(range(nums[0], nums[0] + len(nums)))
    except (ValueError, TypeError):
        return False


def detect_columns(file_path):
    """Detects embedding and metadata columns in a CSV file."""
    with open(file_path, "r", newline="", encoding="utf-8") as f:
        try:
            sample = f.read(1024 * 10)  # read a larger sample for sniffing
            dialect = csv.Sniffer().sniff(sample, delimiters=[',', '\t', ';', '|'])
            delimiter = dialect.delimiter
        except csv.Error:
            delimiter = ','
        f.seek(0)  # reset file pointer
        reader = csv.reader(f, delimiter=delimiter)
        first_lines = list(reader)[:10]

    if not first_lines:
        raise ValueError("No data")

    has_header = detect_header(first_lines)
    if has_header:
        header = first_lines[0]
        data = first_lines[1:]
    else:
        header = []
        data = first_lines

    if not data:
        return has_header, [], [], delimiter

    cols = list(zip(*data))

    candidate_arrays = []
    candidate_numeric = []
    id_like_columns = set()
    text_like_columns = set()

    for ci, col in enumerate(cols):
        col = list(col)
        parsed_rows = [parse_as_array(val) for val in col]

        # Columns where every row parses as an equal-length array are embedding candidates.
        if all(r is not None for r in parsed_rows):
            lengths = {len(r) for r in parsed_rows}
            if len(lengths) == 1:
                candidate_arrays.append(ci)
            else:
                text_like_columns.add(ci)
            continue

        if all(is_numeric(v) for v in col):
            if looks_like_id_column(col):
                id_like_columns.add(ci)
            else:
                candidate_numeric.append(ci)
        else:
            text_like_columns.add(ci)

    identified_embedding_columns = set(candidate_arrays)
    identified_metadata_columns = set()

    if candidate_arrays:
        identified_metadata_columns.update(candidate_numeric)
    else:
        # With no array columns, several plain numeric columns are treated as the embedding.
        if len(candidate_numeric) > 1:
            identified_embedding_columns.update(candidate_numeric)
        else:
            identified_metadata_columns.update(candidate_numeric)

    identified_metadata_columns.update(id_like_columns)
    identified_metadata_columns.update(text_like_columns)

    # A column literally named "id" is always metadata, never an embedding.
    if header:
        for ci, col_name in enumerate(header):
            if col_name.lower() == 'id':
                if ci in identified_embedding_columns:
                    identified_embedding_columns.remove(ci)
                identified_metadata_columns.add(ci)
                break

    emb_cols = [header[i] if header and i < len(header) else i for i in identified_embedding_columns]
    meta_cols = [header[i] if header and i < len(header) else i for i in identified_metadata_columns]

    return has_header, emb_cols, meta_cols, delimiter


def load_and_validate_embeddings(input_file, target_dims):
    """Loads, validates, and summarizes embedding data from a CSV."""
    print(f"Loading data from {input_file}...")
    has_header, embedding_columns, metadata_columns, delimiter = detect_columns(input_file)
    data = pd.read_csv(input_file, header=0 if has_header else None, delimiter=delimiter)

    def is_valid_row(row):
        # A row is kept only if every embedding column parses as an array.
        for col in embedding_columns:
            if parse_as_array(row[col]) is None:
                return False
        return True

    valid_rows_filter = data.apply(is_valid_row, axis=1)
    data = data[valid_rows_filter]

    print("\n=== File Summary ===")
    print(f"File: {input_file}")
    print(f"Rows: {len(data)}")
    print(f"Metadata Columns: {metadata_columns}")
    print(f"Embedding Columns: {embedding_columns}")
    print("====================\n")

    return data, embedding_columns, metadata_columns, has_header, list(data.columns)


def save_compressed_embeddings(output_file, metadata, compressed_embeddings, embedding_columns, original_columns, has_header):
    """Saves compressed embeddings to a CSV file."""
    print(f"Saving compressed data to {output_file}...")
    metadata = metadata.copy()

    # Write each compressed embedding matrix back into its original column as Python lists.
    for i, col in enumerate(embedding_columns):
        metadata[col] = [compressed_embeddings[i][j].tolist() for j in range(compressed_embeddings[i].shape[0])]

    final_df = metadata.reindex(columns=original_columns) if original_columns else metadata
    final_df.to_csv(output_file, index=False, header=has_header)
    print(f"Data saved to {output_file}.")


def run_veczip(input_file, target_dims=16):
    """Runs veczip compression on the input data."""
    data, embedding_columns, metadata_columns, has_header, original_columns = load_and_validate_embeddings(input_file, target_dims)

    all_embeddings = []
    for col in embedding_columns:
        embeddings = np.array([parse_as_array(x) for x in data[col].values])
        all_embeddings.append(embeddings)

    # Pool every embedding column so a single set of retained dimensions applies to all of them.
    combined_embeddings = np.concatenate(all_embeddings, axis=0)
    compressor = veczip(target_dims=target_dims)
    retained_indices = compressor.compress(combined_embeddings)

    compressed_embeddings = []
    for embeddings in all_embeddings:
        compressed_embeddings.append(embeddings[:, retained_indices])

    temp_output = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
    temp_output.close()  # close the handle so the path can be reopened for writing on all platforms
    save_compressed_embeddings(temp_output.name, data[metadata_columns], compressed_embeddings, embedding_columns, original_columns, has_header)
    return temp_output.name


# Streamlit App
def main():
    st.title("Veczip Embeddings Compressor")

    uploaded_file = st.file_uploader("Upload CSV file with embeddings", type=["csv"])

    if uploaded_file:
        try:
            with st.spinner("Analyzing and compressing embeddings..."):
                temp_file = tempfile.NamedTemporaryFile(delete=False)
                temp_file.write(uploaded_file.read())
                temp_file.close()
                output_file_path = run_veczip(temp_file.name)
                with open(output_file_path, 'rb') as f:
                    st.download_button(
                        label="Download Compressed CSV",
                        data=f,
                        file_name="compressed_embeddings.csv",
                        mime="text/csv"
                    )
                os.unlink(temp_file.name)
                os.unlink(output_file_path)
                st.success("Compression complete! Use the button above to download your compressed file.")
        except Exception as e:
            st.error(f"Error processing file: {e}")


if __name__ == "__main__":
    main()
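
Usage note: a minimal sketch of running this app locally, assuming Python 3 with streamlit, pandas, and numpy installed, and that the library providing dejan.veczip is available (the pip package name "dejan" below is an assumption):

pip install streamlit pandas numpy dejan   # package name for dejan.veczip is assumed
streamlit run app.py                       # serves the upload/compress UI in the browser

run_veczip keeps 16 dimensions by default (target_dims=16); the Streamlit UI does not currently expose this parameter.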