Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
updates for reliability and parallel processing
app.py
CHANGED
@@ -1,22 +1,19 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import gradio as gr
|
4 |
-
from
|
5 |
-
|
|
|
|
|
|
|
6 |
|
7 |
-
#
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
if not hf_token:
|
12 |
-
raise ValueError("HF_TOKEN environment variable is not set")
|
13 |
-
if not hf_user:
|
14 |
-
raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
|
15 |
-
|
16 |
-
# Perform login using the token
|
17 |
-
# login(token=hf_token, add_to_git_credential=True)
|
18 |
|
|
|
19 |
SUPPORTED_FILE_TYPES = ["txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html"]
|
|
|
20 |
|
21 |
def validate_url(url):
    """Return True when *url* is a string that starts with ``https://``.

    Non-string input (for example ``None``) is rejected with False instead
    of raising, so callers can treat every failure as an invalid URL.
    """
    return isinstance(url, str) and url.startswith('https://')
|
@@ -24,7 +21,6 @@ def validate_url(url):
|
|
24 |
def clone_repo(url, repo_dir, hf_token, hf_user):
|
25 |
env = os.environ.copy()
|
26 |
env['GIT_LFS_SKIP_SMUDGE'] = '1'
|
27 |
-
# Construct the Git URL with the token and author name for authentication
|
28 |
token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
|
29 |
result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
|
30 |
if result.returncode != 0:
|
@@ -40,14 +36,21 @@ def get_file_summary(file_path, file_type):
|
|
40 |
}
|
41 |
|
42 |
def read_file_content(file_path):
|
43 |
-
with open(file_path, "
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
def validate_file_types(directory):
|
|
|
47 |
m = Magika()
|
48 |
file_types = {}
|
49 |
for root, _, files in os.walk(directory):
|
50 |
-
if '.git'
|
51 |
continue
|
52 |
for file_name in files:
|
53 |
file_path = os.path.join(root, file_name)
|
@@ -55,38 +58,55 @@ def validate_file_types(directory):
|
|
55 |
with open(file_path, 'rb') as file:
|
56 |
file_bytes = file.read()
|
57 |
result = m.identify_bytes(file_bytes)
|
58 |
-
|
|
|
|
|
|
|
59 |
except Exception as e:
|
60 |
file_types[file_path] = f"Error: {str(e)}"
|
61 |
return file_types
|
62 |
|
63 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if not validate_url(url):
|
65 |
return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
|
66 |
|
67 |
-
repo_dir = "
|
68 |
-
if os.path.exists(repo_dir):
|
69 |
-
subprocess.run(["rm", "-rf", repo_dir])
|
70 |
-
|
71 |
success, error = clone_repo(url, repo_dir, hf_token, hf_user)
|
72 |
if not success:
|
73 |
return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
|
74 |
|
75 |
-
file_types = validate_file_types(repo_dir)
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
try:
|
83 |
-
content["content"] = read_file_content(file_path)
|
84 |
-
except Exception as e:
|
85 |
-
content["content"] = f"Failed to read file content: {str(e)}"
|
86 |
-
else:
|
87 |
-
content["content"] = "File too large or binary, content not captured."
|
88 |
|
89 |
-
extracted_content
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
# Cleanup temporary directory
|
92 |
subprocess.run(["rm", "-rf", repo_dir])
|
@@ -106,12 +126,20 @@ def format_output(extracted_content, repo_url):
|
|
106 |
formatted_output += "Error in file data format.\n"
|
107 |
return formatted_output
|
108 |
|
109 |
-
def extract_and_display(url):
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
formatted_output = format_output(extracted_content, url)
|
112 |
return formatted_output
|
113 |
|
114 |
-
app = gr.Blocks(
|
115 |
|
116 |
with app:
|
117 |
gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
|
@@ -125,9 +153,11 @@ with app:
|
|
125 |
],
|
126 |
inputs=url_input
|
127 |
)
|
128 |
-
|
|
|
|
|
129 |
extract_button = gr.Button("Extract Content")
|
130 |
|
131 |
-
extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
|
132 |
|
133 |
-
app.launch()
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import gradio as gr
|
4 |
+
from tqdm import tqdm
|
5 |
+
import chardet
|
6 |
+
import logging
|
7 |
+
import tempfile
|
8 |
+
import concurrent.futures
|
9 |
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
+
# Configurable supported file types and size limit
|
15 |
SUPPORTED_FILE_TYPES = ["txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html"]
|
16 |
+
MAX_FILE_SIZE = 32 * 1024 # 32 KB
|
17 |
|
18 |
def validate_url(url):
    """Return True when *url* is a string that starts with ``https://``.

    Non-string input (for example ``None``) is rejected with False instead
    of raising, so callers can treat every failure as an invalid URL.
    """
    return isinstance(url, str) and url.startswith('https://')
|
|
|
21 |
def clone_repo(url, repo_dir, hf_token, hf_user):
|
22 |
env = os.environ.copy()
|
23 |
env['GIT_LFS_SKIP_SMUDGE'] = '1'
|
|
|
24 |
token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
|
25 |
result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
|
26 |
if result.returncode != 0:
|
|
|
36 |
}
|
37 |
|
38 |
def read_file_content(file_path):
    """Read a file as bytes and decode it with a chardet-detected encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text. An empty file yields ``""``. ``None`` is returned
        when no encoding is detected or the bytes cannot be decoded with
        the detected encoding.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as file:
        file_bytes = file.read()

    # An empty file is valid (empty) text; there is nothing to detect, so
    # return it directly instead of reporting a decode failure.
    if not file_bytes:
        return ""

    encoding = chardet.detect(file_bytes)["encoding"]
    if encoding is None:
        return None

    try:
        return file_bytes.decode(encoding)
    except (UnicodeDecodeError, LookupError, TypeError):
        # LookupError covers a detected name that is not a known codec.
        return None
|
47 |
|
48 |
+
def validate_file_types(directory, supported_file_types):
|
49 |
+
from magika import Magika
|
50 |
m = Magika()
|
51 |
file_types = {}
|
52 |
for root, _, files in os.walk(directory):
|
53 |
+
if any(dir_name in root for dir_name in ['.git', '__pycache__']):
|
54 |
continue
|
55 |
for file_name in files:
|
56 |
file_path = os.path.join(root, file_name)
|
|
|
58 |
with open(file_path, 'rb') as file:
|
59 |
file_bytes = file.read()
|
60 |
result = m.identify_bytes(file_bytes)
|
61 |
+
file_type = result.output.ct_label
|
62 |
+
if file_type not in supported_file_types:
|
63 |
+
file_type = "Unsupported"
|
64 |
+
file_types[file_path] = file_type
|
65 |
except Exception as e:
|
66 |
file_types[file_path] = f"Error: {str(e)}"
|
67 |
return file_types
|
68 |
|
69 |
+
def process_file(file_path, file_type, max_file_size):
    """Build the output record for a single file.

    The record always carries the file summary under ``"header"``. Its
    ``"content"`` is the decoded text, a "Skipped: ..." note when the file
    is over the size limit or of an unsupported type, or a
    "Failed to read file content: ..." note when reading fails.

    Args:
        file_path: Path of the file to process.
        file_type: Type label assigned to the file ("Unsupported" marks
            files whose content must not be read).
        max_file_size: Largest summary ``"size"`` whose content is read.

    Returns:
        A dict with ``"header"`` and ``"content"`` keys.
    """
    summary = get_file_summary(file_path, file_type)
    record = {"header": summary}

    # Guard clause: skipped files never reach the read step.
    too_large = summary["size"] > max_file_size
    if file_type == "Unsupported" or too_large:
        reason = 'File size exceeds limit.' if too_large else 'Unsupported file type.'
        record["content"] = f"Skipped: {reason}"
        return record

    try:
        text = read_file_content(file_path)
    except Exception as e:
        record["content"] = f"Failed to read file content: {str(e)}"
        return record

    if text is None:
        record["content"] = "Failed to read file content: Unsupported encoding or binary file."
    else:
        record["content"] = text
    return record
|
86 |
+
|
87 |
+
def extract_repo_content(url, hf_token, hf_user, supported_file_types, max_file_size):
|
88 |
if not validate_url(url):
|
89 |
return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
|
90 |
|
91 |
+
repo_dir = tempfile.mkdtemp(prefix="temp_repo_")
|
|
|
|
|
|
|
92 |
success, error = clone_repo(url, repo_dir, hf_token, hf_user)
|
93 |
if not success:
|
94 |
return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
|
95 |
|
96 |
+
file_types = validate_file_types(repo_dir, supported_file_types)
|
97 |
+
|
98 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
99 |
+
futures = []
|
100 |
+
for file_path, file_type in file_types.items():
|
101 |
+
future = executor.submit(process_file, file_path, file_type, max_file_size)
|
102 |
+
futures.append(future)
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
+
extracted_content = []
|
105 |
+
with tqdm(total=len(futures), desc="Processing files") as progress_bar:
|
106 |
+
for future in concurrent.futures.as_completed(futures):
|
107 |
+
content = future.result()
|
108 |
+
extracted_content.append(content)
|
109 |
+
progress_bar.update(1)
|
110 |
|
111 |
# Cleanup temporary directory
|
112 |
subprocess.run(["rm", "-rf", repo_dir])
|
|
|
126 |
formatted_output += "Error in file data format.\n"
|
127 |
return formatted_output
|
128 |
|
129 |
+
def extract_and_display(url, supported_file_types, max_file_size):
    """Extract a repository's content and return it as formatted text.

    Args:
        url: URL of the repository to extract.
        supported_file_types: File-type labels whose content is included.
        max_file_size: Size limit in kilobytes, as supplied by the
            "Max File Size (KB)" slider.

    Returns:
        The string produced by ``format_output`` for the extracted content.

    Raises:
        ValueError: If the ``HF_TOKEN`` or ``SPACE_AUTHOR_NAME`` environment
            variable is not set.
    """
    hf_token = os.getenv("HF_TOKEN")
    hf_user = os.getenv("SPACE_AUTHOR_NAME")

    if not hf_token:
        raise ValueError("HF_TOKEN environment variable is not set")
    if not hf_user:
        raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")

    # The slider value is in KB; convert it to bytes before it is compared
    # with file sizes. NOTE(review): assumes get_file_summary reports sizes
    # in bytes, consistent with MAX_FILE_SIZE = 32 * 1024 — confirm.
    max_file_size_bytes = int(max_file_size * 1024)

    extracted_content = extract_repo_content(
        url, hf_token, hf_user, supported_file_types, max_file_size_bytes
    )
    return format_output(extracted_content, url)
|
141 |
|
142 |
+
app = gr.Blocks()
|
143 |
|
144 |
with app:
|
145 |
gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
|
|
|
153 |
],
|
154 |
inputs=url_input
|
155 |
)
|
156 |
+
supported_file_types = gr.CheckboxGroup(SUPPORTED_FILE_TYPES, label="Supported File Types", info="Select the file types to include in the extraction.")
|
157 |
+
max_file_size = gr.Slider(1, 1024, value=32, step=1, label="Max File Size (KB)", info="Files larger than this size will be skipped.")
|
158 |
+
output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than the specified size limit.\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. 🤖")
|
159 |
extract_button = gr.Button("Extract Content")
|
160 |
|
161 |
+
extract_button.click(fn=extract_and_display, inputs=[url_input, supported_file_types, max_file_size], outputs=output_display)
|
162 |
|
163 |
+
app.launch()
|