daqc committed · verified
Commit b67af4a · Parent(s): 08f110c

Upload 61 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the remaining files.

Files changed (50):
  1. .dockerignore +29 -0
  2. .env.template +18 -0
  3. .gitattributes +36 -0
  4. .gitignore +35 -0
  5. LICENSE +201 -0
  6. README-hf.md +14 -0
  7. README.md +52 -0
  8. app.py +1197 -0
  9. assets/images/.gitkeep +2 -0
  10. assets/images/image_logo.png +3 -0
  11. docs/README.md +32 -0
  12. docs/architecture.md +28 -0
  13. docs/contributing.md +28 -0
  14. docs/getting-started.md +34 -0
  15. docs/roadmap.md +7 -0
  16. docs/security.md +15 -0
  17. docs/tools.md +31 -0
  18. docs/troubleshooting.md +17 -0
  19. requirements.txt +132 -0
  20. runtime.txt +1 -0
  21. scripts/__pycache__/cookies.cpython-310.pyc +0 -0
  22. scripts/__pycache__/cvedb_tool.cpython-310.pyc +0 -0
  23. scripts/__pycache__/epss_tool.cpython-310.pyc +0 -0
  24. scripts/__pycache__/hf_tools.cpython-310.pyc +0 -0
  25. scripts/__pycache__/kevin_tool.cpython-310.pyc +0 -0
  26. scripts/__pycache__/mdconvert.cpython-310.pyc +0 -0
  27. scripts/__pycache__/nvd_tool.cpython-310.pyc +0 -0
  28. scripts/__pycache__/report_generator.cpython-310.pyc +0 -0
  29. scripts/__pycache__/text_inspector_tool.cpython-310.pyc +0 -0
  30. scripts/__pycache__/text_web_browser.cpython-310.pyc +0 -0
  31. scripts/__pycache__/visual_qa.cpython-310.pyc +0 -0
  32. scripts/cookies.py +715 -0
  33. scripts/gaia_scorer.py +124 -0
  34. scripts/hf_tools.py +867 -0
  35. scripts/mdconvert.py +982 -0
  36. scripts/reformulator.py +86 -0
  37. scripts/report_generator.py +153 -0
  38. scripts/run_agents.py +87 -0
  39. scripts/text_inspector_tool.py +88 -0
  40. scripts/text_web_browser.py +564 -0
  41. scripts/visual_qa.py +120 -0
  42. set-env.bat +11 -0
  43. tests/hf_tools_tests_output_20250822_034011.txt +2425 -0
  44. tests/run_all_hf_tools_tests.py +99 -0
  45. tests/test_hf_collection_get.py +26 -0
  46. tests/test_hf_collections_list.py +23 -0
  47. tests/test_hf_daily_papers.py +26 -0
  48. tests/test_hf_dataset_info.py +24 -0
  49. tests/test_hf_datasets_search.py +29 -0
  50. tests/test_hf_generate_dashboard_report.py +18 -0
.dockerignore ADDED
@@ -0,0 +1,29 @@
+ venv/
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env/
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .git/
+ .mypy_cache/
+ .pytest_cache/
+ .hypothesis/
+ downloads_folder/
+ reports/
+ uploads/
+ *.html
+ *.pdf
+ *.docx
+ *.pptx
+ *.xlsx
.env.template ADDED
@@ -0,0 +1,18 @@
+ # Copy this file to .env and uncomment/modify the variables you need.
+
+ # 1. HUGGING FACE CONFIGURATION (RECOMMENDED)
+ # For Hugging Face usage: only HF_TOKEN is required (or set it in the UI)
+ HF_TOKEN=your_huggingface_token_here
+ MODEL_ID=Qwen/Qwen3-Coder-480B-A35B-Instruct
+
+ # MODEL_ID=Qwen/Qwen3-Coder-30B-A3B-Instruct
+ # MODEL_ID=Qwen/Qwen2.5-Coder-32B-Instruct
+
+ # 2. OLLAMA CONFIGURATION (LOCAL MODELS)
+ # For Ollama usage: uncomment MODEL_ID, OPENAI_API_BASE, and OPENAI_API_KEY
+ # This project uses OpenAI-compatible variables for Ollama integration
+
+ # MODEL_ID=qwen2.5-coder:7b  # replace with any model (qwen2.5-coder:32b is recommended)
+ # OPENAI_API_BASE=http://localhost:11434/v1
+ # OPENAI_API_KEY=ollama
+
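For orientation, app.py (added later in this commit) loads this file at startup with python-dotenv via `load_dotenv(override=True)`. A minimal sketch of how the variables above map to runtime configuration, assuming the same variable names; the fallback default mirrors the README:

```python
# Sketch only: how the .env values above are typically read at startup (assumes python-dotenv).
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # same call app.py uses; .env values override the process environment

hf_token = os.getenv("HF_TOKEN")  # Hugging Face Inference API path
model_id = os.getenv("MODEL_ID", "Qwen/Qwen3-Coder-480B-A35B-Instruct")  # default taken from the README

# Ollama / local path: the OpenAI-compatible variables are read instead of HF_TOKEN.
openai_api_base = os.getenv("OPENAI_API_BASE")  # e.g. http://localhost:11434/v1
openai_api_key = os.getenv("OPENAI_API_KEY")    # e.g. "ollama"
```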
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/images/image_logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,35 @@
+ # Python virtual environment
+ venv/
+ env/
+ ENV/
+
+ # Python cache files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ dist/
+ build/
+ *.egg-info/
+
+ # Environment variables
+ .env
+
+ # IDE specific files
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Project specific
+ downloads_folder/
+ *.log
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # Local development settings
+ *.local
+
+ .gradio
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README-hf.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Hugging Research
+ emoji: 🔎
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.42.0
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ short_description: CodeAgent-based research assistant for the Hugging Face Hub
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md ADDED
@@ -0,0 +1,52 @@
+
+ <p align="center">
+ <img src="assets/images/image_logo.png" alt="Hugging Research logo" width="120" />
+ </p>
+ <h1 align="center">Hugging Research</h1>
+
+ Hugging Research is a lightweight CodeAgent‑based research assistant for the Hugging Face Hub (models, datasets, Spaces, users, collections, papers). It gathers links via dedicated tools and organizes them for easy review.
+
+ ![screenshot](assets/images/ss1.png)
+
+ ## What it does
+ - Finds relevant models/datasets/Spaces/papers on the Hub
+ - Uses domain‑restricted search for tutorials and docs
+ - Avoids hallucinated links (only cites tool‑returned URLs)
+ - Organizes the found links into a simple, categorized Report view
+
+ ## Quick start
+ 1) Clone and install
+ ```bash
+ git clone https://github.com/mcdaqc/hugging-research
+ cd hugging-research
+ python -m venv venv
+ venv\Scripts\activate # Windows
+ pip install -r requirements.txt
+ ```
+
+ 2) Configure your environment
+ ```bash
+ cp .env.template .env
+ # Edit .env and set:
+ # HF_TOKEN=hf_xxx # only for the inference model
+ # MODEL_ID=Qwen/Qwen3-Coder-480B-A35B-Instruct # optional
+ ```
+
+ 3) Run the app
+ ```bash
+ python app.py
+ # open http://localhost:7860
+ ```
+
+ 4) Use the app
+ - Enter your Hugging Face API key in the sidebar
+ - Click a Basic/Medium/Advanced example, or type your query in natural language
+ - Review the organized links in the Report view
+
+ ## Configuration
+ - `HF_TOKEN`: used for the inference model (agent). Tools are anonymous/read‑only.
+ - `MODEL_ID`: default `Qwen/Qwen3-Coder-480B-A35B-Instruct`.
+
+
+
+
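Before step 3, it can help to confirm that the token in `.env` is accepted. Below is a minimal sketch that mirrors the `whoami()` check app.py performs in `validate_hf_api_key`; it is not a script shipped with the project:

```python
# Sketch only: verify HF_TOKEN before launching the app (mirrors app.py's validate_hf_api_key).
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()
token = os.getenv("HF_TOKEN")
try:
    info = HfApi(token=token).whoami()
    print(f"Token accepted; logged in as {info.get('name', 'unknown')}")
except Exception as err:
    print(f"Token rejected: {err}")
```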
app.py ADDED
@@ -0,0 +1,1197 @@
1
+ import json
2
+ import mimetypes
3
+ import os
4
+ import re
5
+ import shutil
6
+ import threading
7
+ import uuid
8
+ from typing import Optional
9
+ from loguru import logger
10
+ from datetime import datetime
11
+
12
+ import gradio as gr
13
+ from dotenv import load_dotenv
14
+ from huggingface_hub import login, HfApi
15
+ from smolagents import (
16
+ CodeAgent,
17
+ InferenceClientModel,
18
+ Tool,
19
+ DuckDuckGoSearchTool,
20
+ )
21
+ from smolagents.agent_types import (
22
+ AgentAudio,
23
+ AgentImage,
24
+ AgentText,
25
+ handle_agent_output_types,
26
+ )
27
+ from smolagents.gradio_ui import stream_to_gradio
28
+
29
+ from scripts.text_inspector_tool import TextInspectorTool
30
+ from scripts.text_web_browser import (
31
+ ArchiveSearchTool,
32
+ FinderTool,
33
+ FindNextTool,
34
+ PageDownTool,
35
+ PageUpTool,
36
+ SimpleTextBrowser,
37
+ VisitTool,
38
+ )
39
+ from scripts.visual_qa import visualizer
40
+ from scripts.report_generator import HFLinkReportTool
41
+ from scripts.hf_tools import (
42
+ HFModelsSearchTool,
43
+ HFModelInfoTool,
44
+ HFDatasetsSearchTool,
45
+ HFDatasetInfoTool,
46
+ HFSpacesSearchTool,
47
+ HFSpaceInfoTool,
48
+ HFUserInfoTool,
49
+ HFCollectionsListTool,
50
+ HFCollectionGetTool,
51
+ HFPaperInfoTool,
52
+ HFPaperReposTool,
53
+ HFDailyPapersTool,
54
+ HFRepoInfoTool,
55
+ HFSiteSearchTool,
56
+ )
57
+
58
+ # web_search = GoogleSearchTool(provider="serper")
59
+ web_search = DuckDuckGoSearchTool()
60
+
61
+ AUTHORIZED_IMPORTS = [
62
+ "requests",
63
+ "zipfile",
64
+ "pandas",
65
+ "numpy",
66
+ "sympy",
67
+ "json",
68
+ "bs4",
69
+ "pubchempy",
70
+ "xml",
71
+ "yahoo_finance",
72
+ "Bio",
73
+ "sklearn",
74
+ "scipy",
75
+ "pydub",
76
+ "PIL",
77
+ "chess",
78
+ "PyPDF2",
79
+ "pptx",
80
+ "torch",
81
+ "datetime",
82
+ "fractions",
83
+ "csv",
84
+ "plotly",
85
+ "plotly.express",
86
+ "plotly.graph_objects",
87
+ "jinja2",
88
+ ]
89
+
90
+ load_dotenv(override=True)
91
+
92
+ # Only login if HF_TOKEN is available and valid in environment
93
+ if os.getenv("HF_TOKEN"):
94
+ try:
95
+ login(os.getenv("HF_TOKEN"))
96
+ logger.info("Successfully logged in with HF_TOKEN from environment")
97
+ except Exception as e:
98
+ logger.warning(f"Failed to login with HF_TOKEN from environment: {e}")
99
+ logger.info("You can still use the application by providing a valid API key in the interface")
100
+
101
+ # Global session storage for independent user sessions
102
+ user_sessions = {}
103
+ session_lock = threading.Lock()
104
+
105
+ append_answer_lock = threading.Lock()
106
+
107
+ # Initialize browser
108
+ browser = SimpleTextBrowser(request_kwargs={})
109
+
110
+ def validate_hf_api_key(api_key: str) -> tuple[bool, str]:
111
+ """Validate Hugging Face API key by making a test request."""
112
+ if not api_key or not api_key.strip():
113
+ return False, "❌ API key cannot be empty"
114
+
115
+ api_key = api_key.strip()
116
+
117
+ # Basic format validation
118
+ if not api_key.startswith("hf_"):
119
+ return False, "❌ Invalid API key format. Hugging Face API keys start with 'hf_'"
120
+
121
+ try:
122
+ # Test the API key by making a simple request
123
+ api = HfApi(token=api_key)
124
+ # Try to get user info to validate the token
125
+ user_info = api.whoami()
126
+ return True, f"✅ API key validated successfully! Welcome, {user_info.get('name', 'User')}!"
127
+ except Exception as e:
128
+ return False, f"❌ Invalid API key: {str(e)}"
129
+
130
+ def create_model_with_api_key(hf_token: str, model_id: str = None) -> InferenceClientModel:
131
+ """Create a model instance with the provided API key."""
132
+ if not model_id:
133
+ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"
134
+
135
+ # Store original token
136
+ original_token = os.environ.get("HF_TOKEN")
137
+
138
+ try:
139
+ # Set the token in environment for this session
140
+ os.environ["HF_TOKEN"] = hf_token
141
+
142
+ # Create model without explicit token parameter
143
+ model = InferenceClientModel(
144
+ model_id=model_id,
145
+ )
146
+
147
+ return model
148
+ finally:
149
+ # Restore original token
150
+ if original_token:
151
+ os.environ["HF_TOKEN"] = original_token
152
+ elif "HF_TOKEN" in os.environ:
153
+ del os.environ["HF_TOKEN"]
154
+
155
+ def create_tools_with_model(model: InferenceClientModel):
156
+ """Create tools with the provided model."""
157
+ # Verify the model was created correctly
158
+ if model is None:
159
+ raise ValueError("Model is None, cannot create TextInspectorTool")
160
+
161
+ # Text inspector tool disabled for now (inspect_file_as_text)
162
+ # Reason: model attempted to use it with remote URLs; keep only for local uploads when re-enabled.
163
+ # ti_tool = TextInspectorTool(model, 20000)
164
+
165
+ # Hugging Face tools (public-only, anonymous)
166
+ hf_tools = [
167
+ HFModelsSearchTool(),
168
+ HFModelInfoTool(),
169
+ HFDatasetsSearchTool(),
170
+ HFDatasetInfoTool(),
171
+ HFSpacesSearchTool(),
172
+ HFSpaceInfoTool(),
173
+ HFUserInfoTool(),
174
+ HFCollectionsListTool(),
175
+ HFCollectionGetTool(),
176
+ HFPaperInfoTool(),
177
+ HFPaperReposTool(),
178
+ HFDailyPapersTool(),
179
+ HFRepoInfoTool(),
180
+ HFSiteSearchTool(),
181
+ ]
182
+
183
+ tools = hf_tools + [
184
+ web_search, # duckduckgo
185
+ VisitTool(browser),
186
+ PageUpTool(browser),
187
+ PageDownTool(browser),
188
+ FinderTool(browser),
189
+ FindNextTool(browser),
190
+ ArchiveSearchTool(browser),
191
+ # ti_tool, # TextInspectorTool (disabled) — only for uploaded local files; do not use with URLs
192
+ ]
193
+
194
+ return tools
195
+
196
+ # Agent creation in a factory function
197
+ def create_agent(hf_token: str = None, model_id: str = None, max_steps: int = 10):
198
+ """Creates a fresh agent instance for each session"""
199
+ if not hf_token:
200
+ raise ValueError("A valid Hugging Face API key is required to create an agent.")
201
+
202
+ logger.info(f"Creating agent with token: {hf_token[:10]}...")
203
+
204
+ # Use session-specific model with HF_TOKEN
205
+ model = create_model_with_api_key(hf_token, model_id)
206
+ tools = create_tools_with_model(model)
207
+
208
+ # TextInspectorTool temporarily disabled; skip presence check
209
+ # Previous enforcement kept for reference:
210
+ # has_text_inspector = any(getattr(tool, 'name', '') == 'inspect_file_as_text' for tool in tools)
211
+ # if not has_text_inspector:
212
+ # raise ValueError("TextInspectorTool not found in tools list")
213
+
214
+ agent = CodeAgent(
215
+ model=model,
216
+ tools=[visualizer] + tools,
217
+ max_steps=max_steps,
218
+ verbosity_level=1,
219
+ additional_authorized_imports=AUTHORIZED_IMPORTS,
220
+ planning_interval=4,
221
+ )
222
+
223
+ logger.info("Agent created successfully")
224
+ return agent
225
+
226
+ def get_user_session(request: gr.Request) -> str:
227
+ """Get or create a unique session ID for the user."""
228
+ if not request:
229
+ logger.warning("No request object, using random session ID")
230
+ return str(uuid.uuid4())
231
+
232
+ # Try to get session from headers or create new one
233
+ session_id = request.headers.get("x-session-id")
234
+ if not session_id:
235
+ # Use client IP and user agent as a more stable identifier
236
+ client_ip = request.client.host if hasattr(request, 'client') and request.client else "unknown"
237
+ user_agent = request.headers.get("user-agent", "unknown")
238
+ # Create a hash-based session ID for more stability
239
+ import hashlib
240
+ session_hash = hashlib.md5(f"{client_ip}:{user_agent}".encode()).hexdigest()
241
+ session_id = f"session_{session_hash[:8]}"
242
+ logger.info(f"Created stable session ID {session_id} for client {client_ip}")
243
+
244
+ return session_id
245
+
246
+ def get_stable_session_id(request: gr.Request) -> str:
247
+ """Get a stable session ID that persists across requests."""
248
+ if not request:
249
+ logger.warning("No request object, using random session ID")
250
+ return f"random_{str(uuid.uuid4())[:8]}"
251
+
252
+ # Use a combination of client info for more stable sessions
253
+ client_ip = getattr(request.client, 'host', 'unknown') if request.client else 'unknown'
254
+ user_agent = request.headers.get("user-agent", "unknown")
255
+
256
+ # Add additional uniqueness factors
257
+ accept_language = request.headers.get("accept-language", "unknown")
258
+ accept_encoding = request.headers.get("accept-encoding", "unknown")
259
+
260
+ # Create a more unique session ID
261
+ import hashlib
262
+ session_data = f"{client_ip}:{user_agent}:{accept_language}:{accept_encoding}"
263
+ session_hash = hashlib.md5(session_data.encode()).hexdigest()
264
+ session_id = f"user_{session_hash[:16]}"
265
+
266
+ logger.info(f"Generated session ID: {session_id}")
267
+ logger.info(f"Session data: {session_data}")
268
+
269
+ return session_id
270
+
271
+ def get_unique_session_id(request: gr.Request) -> str:
272
+ """Get a truly unique session ID for each request."""
273
+ if not request:
274
+ return f"unique_{str(uuid.uuid4())[:8]}"
275
+
276
+ # Use timestamp + client info for uniqueness
277
+ import time
278
+ timestamp = int(time.time() * 1000) # milliseconds
279
+ client_ip = getattr(request.client, 'host', 'unknown') if request.client else 'unknown'
280
+ user_agent = request.headers.get("user-agent", "unknown")
281
+
282
+ # Create a unique session ID
283
+ import hashlib
284
+ session_data = f"{timestamp}:{client_ip}:{user_agent}"
285
+ session_hash = hashlib.md5(session_data.encode()).hexdigest()
286
+ session_id = f"unique_{session_hash[:16]}"
287
+
288
+ logger.info(f"Generated unique session ID: {session_id}")
289
+
290
+ return session_id
291
+
292
+ def get_persistent_session_id(request: gr.Request) -> str:
293
+ """Get a persistent session ID that stays the same for the same client."""
294
+ if not request:
295
+ return f"persistent_{str(uuid.uuid4())[:8]}"
296
+
297
+ # Use only client info for persistence (no timestamp)
298
+ client_ip = getattr(request.client, 'host', 'unknown') if request.client else 'unknown'
299
+ user_agent = request.headers.get("user-agent", "unknown")
300
+ accept_language = request.headers.get("accept-language", "unknown")
301
+
302
+ # Create a persistent session ID
303
+ import hashlib
304
+ session_data = f"{client_ip}:{user_agent}:{accept_language}"
305
+ session_hash = hashlib.md5(session_data.encode()).hexdigest()
306
+ session_id = f"persistent_{session_hash[:16]}"
307
+
308
+ logger.info(f"Generated persistent session ID: {session_id}")
309
+ logger.info(f"Session data: {session_data}")
310
+
311
+ return session_id
312
+
313
+ def get_session_data(session_id: str) -> dict:
314
+ """Get session data for a specific user."""
315
+ with session_lock:
316
+ if session_id not in user_sessions:
317
+ user_sessions[session_id] = {
318
+ "hf_token": None,
319
+ "agent": None,
320
+ "max_steps": 10,
321
+ "created_at": datetime.now()
322
+ }
323
+ return user_sessions[session_id]
324
+
325
+ def clear_session_data(session_id: str):
326
+ """Clear session data for a specific user."""
327
+ with session_lock:
328
+ if session_id in user_sessions:
329
+ # Clear sensitive data
330
+ user_sessions[session_id]["hf_token"] = None
331
+ user_sessions[session_id]["agent"] = None
332
+ logger.info(f"Session {session_id[:8]}... cleared")
333
+
334
+ def clear_agent_only(session_id: str):
335
+ """Clear only the agent, keeping the API key for convenience."""
336
+ with session_lock:
337
+ if session_id in user_sessions:
338
+ if "agent" in user_sessions[session_id]:
339
+ del user_sessions[session_id]["agent"]
340
+ logger.info(f"Session {session_id[:8]}... agent cleared")
341
+
342
+
343
+
344
+ class GradioUI:
345
+ """A one-line interface to launch your agent in Gradio"""
346
+
347
+ def __init__(self, file_upload_folder: str | None = None):
348
+ self.file_upload_folder = file_upload_folder
349
+ if self.file_upload_folder is not None:
350
+ if not os.path.exists(file_upload_folder):
351
+ os.mkdir(file_upload_folder)
352
+ # No on-disk report saving; reports are rendered in-app only
353
+
354
+ def validate_api_key(self, api_key: str) -> tuple[str, str]:
355
+ """Validate API key and return status message."""
356
+ is_valid, message = validate_hf_api_key(api_key)
357
+ if is_valid:
358
+ return message, "success"
359
+ else:
360
+ return message, "error"
361
+
362
+ def interact_with_agent(self, prompt, messages, request: gr.Request):
363
+ """Handle agent interaction with proper session management."""
364
+ # Get unique session ID for this user
365
+ session_id = get_persistent_session_id(request)
366
+ session_data = get_session_data(session_id)
367
+
368
+ logger.info(f"Processing request for session {session_id}...")
369
+ logger.info(f"Request client: {request.client.host if request and request.client else 'unknown'}")
370
+ logger.info(f"Request user-agent: {request.headers.get('user-agent', 'unknown')[:50] if request else 'unknown'}")
371
+ logger.info(f"All active sessions: {list(user_sessions.keys())}")
372
+ logger.info(f"Session data for {session_id}: {session_data}")
373
+
374
+ # Check if we have a valid agent for this session
375
+ if not session_data.get("agent"):
376
+ # Check if we have a valid HF_TOKEN in session
377
+ hf_token = session_data.get("hf_token")
378
+
379
+ # If no token in session, try to get it from .env file
380
+ if not hf_token:
381
+ env_token = os.getenv("HF_TOKEN")
382
+ if env_token:
383
+ hf_token = env_token
384
+ session_data["hf_token"] = env_token
385
+ session_data["max_steps"] = 10 # Default max_steps
386
+ logger.info(f"Using HF_TOKEN from .env file for session {session_id[:8]}...")
387
+ else:
388
+ logger.warning(f"No API key found for session {session_id[:8]}...")
389
+ error_msg = "❌ No API key configured for your session. Please enter your Hugging Face API key in the API Configuration section above and click 'Setup API Key'."
390
+ messages.append(gr.ChatMessage(role="assistant", content=error_msg))
391
+ yield messages, "", ""
392
+ return
393
+
394
+ logger.info(f"Creating agent for session {session_id[:8]}...")
395
+
396
+ if hf_token:
397
+ try:
398
+ max_steps = session_data.get("max_steps", 10)
399
+ session_data["agent"] = create_agent(hf_token, model_id=os.getenv("MODEL_ID"), max_steps=max_steps)
400
+ logger.info(f"Agent created successfully for session {session_id[:8]}...")
401
+ except Exception as e:
402
+ logger.error(f"Failed to create agent for session {session_id[:8]}: {e}")
403
+ error_msg = f"❌ Failed to create agent with provided API key: {str(e)}"
404
+ messages.append(gr.ChatMessage(role="assistant", content=error_msg))
405
+ yield messages, "", ""
406
+ return
407
+ else:
408
+ logger.info(f"Agent already exists for session {session_id[:8]}...")
409
+
410
+ # Adding monitoring
411
+ try:
412
+ # log the existence of agent memory
413
+ has_memory = hasattr(session_data["agent"], "memory")
414
+ print(f"Agent has memory: {has_memory}")
415
+ if has_memory:
416
+ print(f"Memory type: {type(session_data['agent'].memory)}")
417
+
418
+ # Get current date for the prompt
419
+ from datetime import datetime
420
+ current_date = datetime.now().strftime("%Y-%m-%d")
421
+
422
+ # Prepare the system prompt (Hugging Search)
423
+ system_prompt = f"""You are Hugging Research, an assistant focused on Hugging Face content (models, datasets, Spaces, users, collections, papers) and related learning/blog/news.
424
+
425
+ TODAY'S DATE: {current_date}
426
+
427
+ STYLE
428
+ - Warm, collaborative, concise. use second person (you)
429
+
430
+ ACCESS BOUNDARIES
431
+ - Read‑only. Use only public information.
432
+ - If a tool indicates 401/403/private/gated, state "no access" and continue with other public sources.
433
+
434
+ AVAILABLE TOOLS
435
+ - web_search, visit, page_up, page_down, find, find_next, archive_search, visualizer
436
+ - hf_models_search, hf_model_info, hf_datasets_search, hf_dataset_info, hf_spaces_search, hf_space_info
437
+ - hf_user_info, hf_collections_list, hf_collection_get, hf_paper_info, hf_paper_repos, hf_daily_papers
438
+ - hf_repo_info, hf_site_search
439
+
440
+ LINK POLICY (anti‑hallucination)
441
+ - Only cite URLs that come directly from tool outputs. Never invent or guess links.
442
+ - Prefer official huggingface.co URLs for models/datasets/Spaces/papers.
443
+ - For tutorials/blogs/news, prefer huggingface.co when the same content exists there.
444
+ - If you need a URL that isn't present, first use a tool (web_search or hf_site_search) to retrieve it, then cite it.
445
+
446
+ TOOL USAGE POLICY
447
+ - You can write compact Python to orchestrate multiple tool calls in one block.
448
+ - Never dump large/raw JSON. If using python_interpreter, ensure visible output by printing a short structured summary (<=20 lines) or leaving a final expression; otherwise summarize in natural language.
449
+ - Keep parameters minimal: include query and limit; add owner only if asked; use a single pipeline_tag or tags only if explicitly implied; use sort/direction when asked or implied (default downloads/descending; 'trending' allowed).
450
+ - Default to limit=10 for searches unless the user explicitly asks for more.
451
+ - Use web_search to capture fresh/trending context; use hf_site_search for tutorials/blog/Learn.
452
+ - Use only the listed tools; do not call undefined helpers (e.g., visit_page).
453
+ - web_search returns plain text; never json.load or index it. Use it only for keywords or discovering links.
454
+ - hf_* tools return JSON serialized as string; always json.loads(...) before indexing keys like 'results' or 'item'.
455
+
456
+ STARTING MOVE
457
+ - Begin with multiple web_search to capture today‑relevant terms (include "Hugging Face" in the query when helpful). Derive 3–5 keywords and reuse them across hf_* calls.
458
+
459
+ DECISION RULES
460
+ - Prefer hf_* tools for official Hub content. Use derived keywords; do not rely only on date sort.
461
+ - Stop calling tools once you have enough signal for a confident, useful answer.
462
+
463
+ FINAL STEP GUIDANCE
464
+ - Do not call any dashboard/report tool. The app will automatically generate a dashboard from your final answer text for the Report tab. Focus on writing a clean Final Answer with accurate inline links derived from tool outputs.
465
+
466
+ OUTPUT REQUIREMENTS
467
+ - Provide a conversational summary tailored to the user’s goal.
468
+ - Structure: brief opening (what we looked for and why), key findings woven into short prose.
469
+ - Use inline links to official HF pages for repos and to reputable external sources for tutorials/news.
470
+ - Briefly mention at least one relevant item with inline links across these categories when available: models, datasets, Spaces, papers, blogs/docs, repositories, videos, news.
471
+
472
+ EXAMPLES (GOOD)
473
+ # Derive keywords then orchestrate searches
474
+ results_web = web_search(query="diffusion models Hugging Face latest")
475
+ import json
476
+ models = json.loads(hf_models_search(query="semantic search", limit=5)).get("results", [])
477
+ ds = json.loads(hf_datasets_search(query="semantic search", limit=5)).get("results", [])
478
+ repo = json.loads(hf_model_info(repo_id="sentence-transformers/all-MiniLM-L6-v2")).get("item")
479
+ spaces = json.loads(hf_spaces_search(query="whisper transcription", limit=5)).get("results", [])
480
+ learn = json.loads(hf_site_search(query="fine-tuning tutorial Hugging Face course", limit=5)).get("results", [])
481
+ # Final step: compose the final answer in natural language with inline links.
482
+ # The app will build a dashboard automatically from your final answer (no extra tool call needed).
483
+ final_answer_text = "We looked at semantic search models and datasets, including https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 ..."
484
+
485
+ Now is your turn to answer the user query.
486
+
487
+ User Query: """
488
+
489
+ # Combine system prompt with user message
490
+ full_prompt = system_prompt + prompt
491
+
492
+ # Extract clean message for display (remove internal context)
493
+ display_message = prompt
494
+ if "[INTERNAL CONTEXT:" in prompt:
495
+ display_message = prompt.split("[INTERNAL CONTEXT:")[0].strip()
496
+
497
+ messages.append(gr.ChatMessage(role="user", content=display_message))
498
+ yield messages, "", ""
499
+
500
+ logger.info(f"Starting agent interaction for session {session_id[:8]}...")
501
+ latest_assistant_text = ""
502
+ for msg in stream_to_gradio(
503
+ session_data["agent"], task=full_prompt, reset_agent_memory=False
504
+ ):
505
+ # If the message contains an HTML report, just pass it through (no on-disk saving)
506
+ # We render the dashboard in the Report tab below.
507
+ # (Intentionally no file saving)
508
+ messages.append(msg)
509
+ if getattr(msg, "role", None) == "assistant" and isinstance(msg.content, str):
510
+ latest_assistant_text = msg.content
511
+ yield messages, "", ""
512
+
513
+ # Clear sensitive data from session after interaction (AUTOMATIC)
514
+ # Note: We clear the agent but keep the API key for convenience
515
+ if "agent" in session_data:
516
+ del session_data["agent"]
517
+ logger.info(f"Session {session_id[:8]}... agent cleared after interaction")
518
+
519
+ # Build Report tab content
520
+ last_answer = latest_assistant_text or ""
521
+ report_md = ""
522
+ if display_message or last_answer:
523
+ report_md = f"### Prompt\n{display_message}\n\n{last_answer}"
524
+ # Generate report HTML from the final answer
525
+ dashboard_html = ""
526
+ try:
527
+ dashboard_html = HFLinkReportTool().forward(final_answer=last_answer, query=display_message)
528
+ except Exception:
529
+ dashboard_html = ""
530
+ yield messages, report_md, dashboard_html
531
+ except Exception as e:
532
+ logger.error(f"Error in interaction for session {session_id[:8]}: {str(e)}")
533
+ print(f"Error in interaction: {str(e)}")
534
+ error_msg = f"❌ Error during interaction: {str(e)}"
535
+ messages.append(gr.ChatMessage(role="assistant", content=error_msg))
536
+ yield messages, "", ""
537
+
538
+ def setup_api_key(self, api_key: str, request: gr.Request) -> str:
539
+ """Setup API key for the user's session."""
540
+ # Get unique session ID for this user
541
+ session_id = get_persistent_session_id(request)
542
+ session_data = get_session_data(session_id)
543
+
544
+ logger.info(f"Setting up API key for session {session_id}...")
545
+ logger.info(f"Setup request client: {request.client.host if request and request.client else 'unknown'}")
546
+ logger.info(f"Setup request user-agent: {request.headers.get('user-agent', 'unknown')[:50] if request else 'unknown'}")
547
+ logger.info(f"All active sessions before setup: {list(user_sessions.keys())}")
548
+ logger.info(f"Session data before setup: {session_data}")
549
+
550
+ # Check if API key is provided from interface
551
+ if api_key and api_key.strip():
552
+ # Use the API key from interface
553
+ token_to_use = api_key.strip()
554
+ source = "interface"
555
+ else:
556
+ # Try to use token from .env file
557
+ env_token = os.getenv("HF_TOKEN")
558
+ if env_token:
559
+ token_to_use = env_token
560
+ source = ".env file"
561
+ else:
562
+ return "❌ No API key provided. Please enter your Hugging Face API key or set HF_TOKEN in your .env file."
563
+
564
+ # Validate the token
565
+ is_valid, message = validate_hf_api_key(token_to_use)
566
+
567
+ if is_valid:
568
+ # Store HF_TOKEN in session data
569
+ session_data["hf_token"] = token_to_use
570
+ session_data["max_steps"] = 10
571
+ logger.info(f"API key stored in session {session_id[:8]}... from {source}")
572
+ logger.info(f"Max steps set to fixed value: 10")
573
+
574
+ # Create new agent with the HF_TOKEN and max_steps
575
+ try:
576
+ session_data["agent"] = create_agent(token_to_use, model_id=os.getenv("MODEL_ID"), max_steps=10)
577
+ logger.info(f"Agent created successfully for session {session_id[:8]}...")
578
+ return f"✅ API key from {source} validated and agent created successfully! {message.split('!')[1] if '!' in message else ''}"
579
+ except Exception as e:
580
+ logger.error(f"Failed to create agent for session {session_id[:8]}: {e}")
581
+ return f"❌ Failed to create agent with API key from {source}: {str(e)}"
582
+ else:
583
+ logger.warning(f"Invalid API key for session {session_id[:8]}... from {source}")
584
+ return f"❌ Invalid API key from {source}: {message}"
585
+
586
+ def upload_file(
587
+ self,
588
+ file,
589
+ file_uploads_log,
590
+ allowed_file_types=[
591
+ "application/pdf",
592
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
593
+ "text/plain",
594
+ ],
595
+ ):
596
+ """
597
+ Handle file uploads, default allowed types are .pdf, .docx, and .txt
598
+ """
599
+ if file is None:
600
+ return gr.Textbox("No file uploaded", visible=True), file_uploads_log
601
+
602
+ try:
603
+ mime_type, _ = mimetypes.guess_type(file.name)
604
+ except Exception as e:
605
+ return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
606
+
607
+ if mime_type not in allowed_file_types:
608
+ return gr.Textbox("File type disallowed", visible=True), file_uploads_log
609
+
610
+ # Sanitize file name
611
+ original_name = os.path.basename(file.name)
612
+ sanitized_name = re.sub(
613
+ r"[^\w\-.]", "_", original_name
614
+ ) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores
615
+
616
+ type_to_ext = {}
617
+ for ext, t in mimetypes.types_map.items():
618
+ if t not in type_to_ext:
619
+ type_to_ext[t] = ext
620
+
621
+ # Ensure the extension correlates to the mime type
622
+ sanitized_name = sanitized_name.split(".")[:-1]
623
+ sanitized_name.append("" + type_to_ext[mime_type])
624
+ sanitized_name = "".join(sanitized_name)
625
+
626
+ # Save the uploaded file to the specified folder
627
+ file_path = os.path.join(
628
+ self.file_upload_folder, os.path.basename(sanitized_name)
629
+ )
630
+ shutil.copy(file.name, file_path)
631
+
632
+ return gr.Textbox(
633
+ f"File uploaded: {file_path}", visible=True
634
+ ), file_uploads_log + [file_path]
635
+
636
+ def log_user_message(self, text_input, file_uploads_log):
637
+ # Create the user message for display (clean, without file info)
638
+ display_message = text_input
639
+
640
+ # Create the internal message for the agent (with file context)
641
+ internal_message = text_input
642
+ if len(file_uploads_log) > 0:
643
+ file_names = [os.path.basename(f) for f in file_uploads_log]
644
+ file_paths = [f for f in file_uploads_log] # Full paths
645
+ # Note: inspect_file_as_text is currently disabled (only for local uploads when re-enabled)
646
+ internal_message += f"\n\n[Uploaded files available: {', '.join(file_names)}. You can reference their content if needed (plain text).]"
647
+
648
+ return (
649
+ internal_message, # This goes to the agent (with file context)
650
+ gr.Textbox(
651
+ value="",
652
+ interactive=False,
653
+ placeholder="Please wait while Steps are getting populated",
654
+ ),
655
+ gr.Button(interactive=False),
656
+ )
657
+
658
+ def detect_device(self, request: gr.Request):
659
+ # Check whether the user device is a mobile or a computer
660
+
661
+ if not request:
662
+ return "Desktop" # Default to desktop if no request info
663
+
664
+ # Method 1: Check sec-ch-ua-mobile header (most reliable)
665
+ is_mobile_header = request.headers.get("sec-ch-ua-mobile")
666
+ if is_mobile_header:
667
+ return "Mobile" if "?1" in is_mobile_header else "Desktop"
668
+
669
+ # Method 2: Check user-agent string
670
+ user_agent = request.headers.get("user-agent", "").lower()
671
+ mobile_keywords = ["android", "iphone", "ipad", "mobile", "phone", "tablet"]
672
+
673
+ # More comprehensive mobile detection
674
+ if any(keyword in user_agent for keyword in mobile_keywords):
675
+ return "Mobile"
676
+
677
+ # Check for mobile-specific patterns
678
+ if "mobile" in user_agent or "android" in user_agent or "iphone" in user_agent:
679
+ return "Mobile"
680
+
681
+ # Method 3: Check platform
682
+ platform = request.headers.get("sec-ch-ua-platform", "").lower()
683
+ if platform:
684
+ if platform in ['"android"', '"ios"']:
685
+ return "Mobile"
686
+ elif platform in ['"windows"', '"macos"', '"linux"']:
687
+ return "Desktop"
688
+
689
+ # Method 4: Check viewport width (if available)
690
+ viewport_width = request.headers.get("viewport-width")
691
+ if viewport_width:
692
+ try:
693
+ width = int(viewport_width)
694
+ return "Mobile" if width <= 768 else "Desktop"
695
+ except ValueError:
696
+ pass
697
+
698
+ # Default case if no clear indicators
699
+ return "Desktop"
700
+
701
+ def launch(self, **kwargs):
702
+ # Custom CSS for mobile optimization
703
+ custom_css = """
704
+ @media (max-width: 768px) {
705
+ .gradio-container {
706
+ max-width: 100% !important;
707
+ padding: 10px !important;
708
+ }
709
+ .main {
710
+ padding: 10px !important;
711
+ }
712
+ .chatbot {
713
+ max-height: 60vh !important;
714
+ }
715
+ .textbox {
716
+ font-size: 16px !important; /* Prevents zoom on iOS */
717
+ }
718
+ .button {
719
+ min-height: 44px !important; /* Better touch targets */
720
+ }
721
+ }
722
+ """
723
+
724
+ with gr.Blocks(theme="ocean", fill_height=True, css=custom_css) as demo:
725
+ # Different layouts for mobile and computer devices
726
+ @gr.render()
727
+ def layout(request: gr.Request):
728
+ device = self.detect_device(request)
729
+ print(f"device - {device}")
730
+ # Render layout with sidebar
731
+ # Prepare logo as data URI for reliable rendering
732
+ try:
733
+ import base64
734
+ _logo_src = ""
735
+ _used = ""
736
+ for _p in ("assets/images/@image_logo.png", "assets/images/image_logo.png"):
737
+ if os.path.exists(_p):
738
+ with open(_p, "rb") as _lf:
739
+ _b64 = base64.b64encode(_lf.read()).decode("ascii")
740
+ _logo_src = f"data:image/png;base64,{_b64}"
741
+ _used = _p
742
+ break
743
+ print(f"Logo path used: {_used or 'none'}")
744
+ except Exception as _e:
745
+ print(f"Logo load error: {_e}")
746
+ _logo_src = ""
747
+ _logo_img_html = (
748
+ f'<img src="{_logo_src}" alt="App logo" style="vertical-align: middle; margin-right: 10px; height: 40px; display:inline-block;">'
749
+ if _logo_src else ""
750
+ )
751
+ if device == "Desktop":
752
+ with gr.Blocks(
753
+ fill_height=True,
754
+ ):
755
+ file_uploads_log = gr.State([])
756
+ with gr.Sidebar():
757
+ # Project title and repository link at the top
758
+ gr.Markdown(value=f"<h1 style=\"display:flex; align-items:center; gap:10px; margin:0; text-align:left;\">{_logo_img_html}Hugging Research</h1>")
759
+ gr.Markdown("""<img src=\"https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png\" width=\"20\" height=\"20\" style=\"display: inline-block; vertical-align: middle; margin-right: 8px;\"> <a href=\"https://github.com/mcdaqc/hugging-research\" target=\"_blank\">Github Repository</a>""")
760
+
761
+ # About section
762
+ with gr.Accordion("ℹ️ About", open=False):
763
+ gr.Markdown("""**What it does:**
764
+ Hugging Research finds Hugging Face models, datasets, and Spaces with direct links and short summaries.
765
+
766
+ **Available tools:**
767
+ - Hugging Face Hub API endpoints (via hf_* tools) — see [Hub API](https://huggingface.co/docs/hub/en/api)
768
+ - Web search + basic navigation (DuckDuckGo `web_search`, `visit`, `page_up/down`, `find`, `archive_search`)
769
+
770
+ **Model configuration:**
771
+ - Default: Qwen/Qwen3-Coder-480B-A35B-Instruct (HF Inference API)
772
+ - Optional: Ollama/local via `.env`
773
+
774
+ **How to use:**
775
+ - Enter your Hugging Face API key
776
+ - Ask in natural language (e.g., models/datasets/spaces by topic or owner; or “I’m new to LLMs/fine‑tuning—where to start?”)
777
+ - Get concise, linked results""")
778
+
779
+ with gr.Group():
780
+ gr.Markdown("**Your request**", container=True)
781
+ text_input = gr.Textbox(
782
+ lines=3,
783
+ label="Your request",
784
+ container=False,
785
+ placeholder="Enter your prompt here and press Shift+Enter or press the button",
786
+ )
787
+ launch_research_btn = gr.Button(
788
+ "Run", variant="primary"
789
+ )
790
+
791
+ # Examples Section
792
+ with gr.Accordion("💡 Example Prompts", open=False):
793
+ gr.Markdown("**Click any example below to populate your request field:**")
794
+
795
+ example_btn_1 = gr.Button("Basic: Tiny chatbot", size="sm", variant="secondary")
796
+ example_btn_2 = gr.Button("Medium: RAG Q&A", size="sm", variant="secondary")
797
+ example_btn_3 = gr.Button("Advanced: Instr. tuning", size="sm", variant="secondary")
798
+
799
+ # Example button events
800
+ example_btn_1.click(
801
+ lambda: "I want a small chatbot I can run on my laptop. Recommend a few lightweight chat models, a small dialogue dataset to fine‑tune, and a beginner‑friendly finetuning guide. Include a Space I can duplicate or clear steps to run locally.",
802
+ None,
803
+ [text_input]
804
+ )
805
+ example_btn_2.click(
806
+ lambda: "I'm building a document Q&A RAG app. Recommend CPU‑friendly embedding models and an optional reranker, give sensible chunk size and overlap defaults, suggest a small starter dataset, link a Space I can duplicate or a repo for an end‑to‑end pipeline, and provide a short guide to evaluate answer quality.",
807
+ None,
808
+ [text_input]
809
+ )
810
+ example_btn_3.click(
811
+ lambda: "I'm exploring instruction‑tuning and preference optimization for code LLMs. Surface recent papers from 2024 and 2025 on SFT, DPO, ORPO, and GRPO, link relevant datasets, models and repos that implement these methods, outline typical training and evaluation setups, and highlight open challenges and safety notes.",
812
+ None,
813
+ [text_input]
814
+ )
815
+
816
+ # API Key Configuration Section
817
+ with gr.Accordion("🔑 API Configuration", open=False):
818
+ gr.Markdown("**Configure your Hugging Face API Key**")
819
+ gr.Markdown("🔒 **Security**: Your API key is only kept during this session.")
820
+ gr.Markdown("Get your API key from: https://huggingface.co/settings/tokens")
821
+
822
+ api_key_input = gr.Textbox(
823
+ label="Hugging Face API Key",
824
+ placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
825
+ type="password",
826
+ lines=1
827
+ )
828
+ api_key_status = gr.Textbox(
829
+ label="Status",
830
+ value="✅ HF_TOKEN found in .env file. To use a different key, enter it above and click 'Setup API Key'." if os.getenv("HF_TOKEN") else "⚠️ Please enter your Hugging Face API key above and click 'Setup API Key' to start using the application.",
831
+ interactive=False
832
+ )
833
+
834
+ # Agent configuration (fixed steps)
835
+ gr.Markdown("**Agent Configuration** — steps are fixed for stability.")
836
+
837
+ setup_api_btn = gr.Button("Setup API Key", variant="secondary")
838
+
839
+ # If an upload folder is provided, enable the upload feature
840
+ # COMMENTED: File upload feature temporarily disabled - works but consumes too many steps for parsing
841
+ # TODO: Re-enable after optimizing TextInspectorTool to use fewer steps
842
+ # if self.file_upload_folder is not None:
843
+ # upload_file = gr.File(label="Upload a file")
844
+ # upload_status = gr.Textbox(
845
+ # label="Upload Status",
846
+ # interactive=False,
847
+ # visible=False,
848
+ # )
849
+ # upload_file.change(
850
+ # self.upload_file,
851
+ # [upload_file, file_uploads_log],
852
+ # [upload_status, file_uploads_log],
853
+ # )
854
+
855
+ # Powered by smolagents
856
+ with gr.Row():
857
+ gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">Powered by
858
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
859
+ <a target="_blank" href="https://github.com/huggingface/smolagents"><b>hf/smolagents</b></a>
860
+ </div>""")
861
+
862
+ # Chat interface
863
+ stored_messages = gr.State([])
864
+ chatbot = gr.Chatbot(
865
+ label="open-Deep-Research",
866
+ type="messages",
867
+ avatar_images=(
868
+ None,
869
+ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
870
+ ),
871
+ resizeable=False,
872
+ scale=1,
873
+ elem_id="my-chatbot",
874
+ )
875
+
876
+ # Tabs for Research and Report
877
+ with gr.Tabs():
878
+ with gr.Tab("Report"):
879
+ report_markdown = gr.Markdown(value="")
880
+ report_dashboard = gr.HTML(value="")
881
+
882
+ # API Key setup event
883
+ setup_api_btn.click(
884
+ self.setup_api_key,
885
+ [api_key_input],
886
+ [api_key_status]
887
+ )
888
+
889
+
890
+
891
+ text_input.submit(
892
+ self.log_user_message,
893
+ [text_input, file_uploads_log],
894
+ [stored_messages, text_input, launch_research_btn],
895
+ ).then(
896
+ self.interact_with_agent,
897
+ [stored_messages, chatbot],
898
+ [chatbot, report_markdown, report_dashboard],
899
+ ).then(
900
+ lambda: (
901
+ gr.Textbox(
902
+ interactive=True,
903
+ placeholder="Enter your prompt here and press the button",
904
+ ),
905
+ gr.Button(interactive=True),
906
+ ),
907
+ None,
908
+ [text_input, launch_research_btn],
909
+ )
910
+ launch_research_btn.click(
911
+ self.log_user_message,
912
+ [text_input, file_uploads_log],
913
+ [stored_messages, text_input, launch_research_btn],
914
+ ).then(
915
+ self.interact_with_agent,
916
+ [stored_messages, chatbot],
917
+ [chatbot, report_markdown, report_dashboard],
918
+ ).then(
919
+ lambda: (
920
+ gr.Textbox(
921
+ interactive=True,
922
+ placeholder="Enter your prompt here and press the button",
923
+ ),
924
+ gr.Button(interactive=True),
925
+ ),
926
+ None,
927
+ [text_input, launch_research_btn],
928
+ )
929
+
930
+ # Render simple layout for mobile
931
+ else:
932
+ try:
933
+ with gr.Blocks(
934
+ fill_height=True,
935
+ ):
936
+ # Project title and repository link at the top
937
+ gr.Markdown(value=f"<h1 style=\"display:flex; align-items:center; gap:10px; margin:0; text-align:left;\">{_logo_img_html}Hugging Research</h1>")
938
+ gr.Markdown("""<img src=\"https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png\" width=\"20\" height=\"20\" style=\"display: inline-block; vertical-align: middle; margin-right: 8px;\"> <a href=\"https://github.com/mcdaqc/hugging-research\" target=\"_blank\">Github Repository</a>""")
939
+
940
+ # About section for mobile
941
+ with gr.Accordion("ℹ️ About", open=False):
942
+ gr.Markdown("""**What it does:**
943
+ Hugging Research finds Hugging Face models, datasets, and Spaces with direct links and short summaries.
944
+
945
+ **Available tools:**
946
+ - Hugging Face Hub API endpoints (via hf_* tools) — see [Hub API](https://huggingface.co/docs/hub/en/api)
947
+ - Web search + basic navigation (DuckDuckGo `web_search`, `visit`, `page_up/down`, `find`, `archive_search`)
948
+
949
+ **Model configuration:**
950
+ - Default: Qwen/Qwen3-Coder-480B-A35B-Instruct (HF Inference API)
951
+ - Optional: Ollama/local via `.env`
952
+
953
+ **How to use:**
954
+ - Enter your Hugging Face API key
955
+ - Ask in natural language (e.g., models/datasets/spaces by topic or owner; or “I’m new to LLMs/fine‑tuning—where to start?”)
956
+ - Get concise, linked results""")
957
+
958
+ # API Key Configuration Section for Mobile
959
+ with gr.Accordion("🔑 API Configuration", open=False):
960
+ gr.Markdown("**Configure your Hugging Face API Key**")
961
+ gr.Markdown("🔒 **Security**: Your API key is only kept during this session.")
962
+ gr.Markdown("Get your API key from: https://huggingface.co/settings/tokens")
963
+
964
+ mobile_api_key_input = gr.Textbox(
965
+ label="Hugging Face API Key",
966
+ placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
967
+ type="password",
968
+ lines=1
969
+ )
970
+ mobile_api_key_status = gr.Textbox(
971
+ label="Status",
972
+ value="✅ HF_TOKEN found in .env file. To use a different key, enter it above and click 'Setup API Key'." if os.getenv("HF_TOKEN") else "⚠️ Please enter your Hugging Face API key above and click 'Setup API Key' to start using the application.",
973
+ interactive=False
974
+ )
975
+
976
+ # Agent configuration for mobile
977
+ gr.Markdown("**Agent Configuration**")
978
+ mobile_max_steps_slider = gr.Slider(
979
+ minimum=5,
980
+ maximum=30,
981
+ value=10,
982
+ step=1,
983
+ label="Maximum Steps",
984
+ info="Number of steps the agent can take per session (higher = more detailed but slower)"
985
+ )
986
+
987
+ mobile_setup_api_btn = gr.Button("Setup API Key", variant="secondary")
988
+
989
+ # Chat interface for mobile
990
+ stored_messages = gr.State([])
991
+ file_uploads_log = gr.State([])
992
+ chatbot = gr.Chatbot(
993
+ label="open-Deep-Research",
994
+ type="messages",
995
+ avatar_images=(
996
+ None,
997
+ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
998
+ ),
999
+ resizeable=True,
1000
+ scale=1,
1001
+ )
1002
+
1003
+ # Input section for mobile
1004
+ text_input = gr.Textbox(
1005
+ lines=1,
1006
+ label="Your request",
1007
+ placeholder="Enter your prompt here and press the button",
1008
+ )
1009
+ launch_research_btn = gr.Button(
1010
+ "Run",
1011
+ variant="primary",
1012
+ )
1013
+
1014
+ # File upload section for mobile (simple)
1015
+ # COMMENTED: File upload feature temporarily disabled - works but consumes too many steps for parsing
1016
+ # TODO: Re-enable after optimizing
1017
+ # if self.file_upload_folder is not None:
1018
+ # mobile_upload_file = gr.File(label="📎 Upload PDF/TXT file (optional)")
1019
+ # mobile_upload_status = gr.Textbox(
1020
+ # label="Upload Status",
1021
+ # interactive=False,
1022
+ # visible=False,
1023
+ # )
1024
+ # mobile_upload_file.change(
1025
+ # self.upload_file,
1026
+ # [mobile_upload_file, file_uploads_log],
1027
+ # [mobile_upload_status, file_uploads_log],
1028
+ # )
1029
+
1030
+ # Examples Section for Mobile
1031
+ with gr.Accordion("💡 Example Prompts", open=False):
1032
+ gr.Markdown("**Click any example below to populate your request field:**")
1033
+
1034
+ mobile_example_btn_1 = gr.Button("Basic: Tiny chatbot", size="sm", variant="secondary")
1035
+ mobile_example_btn_2 = gr.Button("Medium: RAG Q&A", size="sm", variant="secondary")
1036
+ mobile_example_btn_3 = gr.Button("Advanced: Instr. tuning", size="sm", variant="secondary")
1037
+
1038
+ # Powered by smolagents for mobile
1039
+ with gr.Row():
1040
+ gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">Powered by
1041
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
1042
+ <a target="_blank" href="https://github.com/huggingface/smolagents"><b>hf/smolagents</b></a>
1043
+ </div>""")
1044
+
1045
+ # Mobile API Key setup event
1046
+ mobile_setup_api_btn.click(
1047
+ self.setup_api_key,
1048
+ [mobile_api_key_input],
1049
+ [mobile_api_key_status]
1050
+ )
1051
+
1052
+ # Mobile Example button events
1053
+ mobile_example_btn_1.click(
1054
+ lambda: "I want a small chatbot I can run on my laptop. Recommend a few lightweight chat models, a small dialogue dataset to fine‑tune, and a beginner‑friendly finetuning guide. Include a Space I can duplicate or clear steps to run locally.",
1055
+ None,
1056
+ [text_input]
1057
+ )
1058
+ mobile_example_btn_2.click(
1059
+ lambda: "I'm building a document Q&A RAG app. Recommend CPU‑friendly embedding models and an optional reranker, give sensible chunk size and overlap defaults, suggest a small starter dataset, link a Space I can duplicate or a repo for an end‑to‑end pipeline, and provide a short guide to evaluate answer quality.",
1060
+ None,
1061
+ [text_input]
1062
+ )
1063
+ mobile_example_btn_3.click(
1064
+ lambda: "I'm exploring instruction‑tuning and preference optimization for code LLMs. Surface recent papers from 2024 and 2025 on SFT, DPO, ORPO, and GRPO, link relevant datasets, models and repos that implement these methods, outline typical training and evaluation setups, and highlight open challenges and safety notes.",
1065
+ None,
1066
+ [text_input]
1067
+ )
1068
+
1069
+ # Research and Report panels for mobile
1070
+ with gr.Tabs():
1071
+ with gr.Tab("Report"):
1072
+ m_report_markdown = gr.Markdown(value="")
1073
+ m_report_dashboard = gr.HTML(value="")
1074
+
1075
+ # Mobile chat events
1076
+ text_input.submit(
1077
+ self.log_user_message,
1078
+ [text_input, file_uploads_log],
1079
+ [stored_messages, text_input, launch_research_btn],
1080
+ ).then(
1081
+ self.interact_with_agent,
1082
+ [stored_messages, chatbot],
1083
+ [chatbot, m_report_markdown, m_report_dashboard],
1084
+ ).then(
1085
+ lambda: (
1086
+ gr.Textbox(
1087
+ interactive=True,
1088
+ placeholder="Enter your prompt here and press the button",
1089
+ ),
1090
+ gr.Button(interactive=True),
1091
+ ),
1092
+ None,
1093
+ [text_input, launch_research_btn],
1094
+ )
1095
+ launch_research_btn.click(
1096
+ self.log_user_message,
1097
+ [text_input, file_uploads_log],
1098
+ [stored_messages, text_input, launch_research_btn],
1099
+ ).then(
1100
+ self.interact_with_agent,
1101
+ [stored_messages, chatbot],
1102
+ [chatbot, m_report_markdown, m_report_dashboard],
1103
+ ).then(
1104
+ lambda: (
1105
+ gr.Textbox(
1106
+ interactive=True,
1107
+ placeholder="Enter your prompt here and press the button",
1108
+ ),
1109
+ gr.Button(interactive=True),
1110
+ ),
1111
+ None,
1112
+ [text_input, launch_research_btn],
1113
+ )
1114
+ except Exception as e:
1115
+ # Fallback to desktop layout if mobile layout fails
1116
+ logger.error(f"Mobile layout failed: {e}")
1117
+ # Re-render desktop layout as fallback
1118
+ with gr.Blocks(fill_height=True):
1119
+ gr.Markdown(value=f"<h1 style=\"display:flex; align-items:center; gap:10px; margin:0; text-align:left;\">{_logo_img_html}Hugging Research</h1>")
1120
+ gr.Markdown("""<img src=\"https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png\" width=\"20\" height=\"20\" style=\"display: inline-block; vertical-align: middle; margin-right: 8px;\"> <a href=\"https://github.com/mcdaqc/hugging-research\" target=\"_blank\">Github Repository</a>""")
1121
+ gr.Markdown("⚠️ Mobile layout failed, using desktop layout as fallback.")
1122
+
1123
+ # Simple fallback interface
1124
+ stored_messages = gr.State([])
1125
+ file_uploads_log = gr.State([])
1126
+ chatbot = gr.Chatbot(
1127
+ label="open-Deep-Research",
1128
+ type="messages",
1129
+ avatar_images=(
1130
+ None,
1131
+ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
1132
+ ),
1133
+ )
1134
+ with gr.Tabs():
1135
+ with gr.Tab("Report"):
1136
+ fb_report_markdown = gr.Markdown(value="")
1137
+ fb_report_dashboard = gr.HTML(value="")
1138
+
1139
+ text_input = gr.Textbox(
1140
+ lines=1,
1141
+ label="Your request",
1142
+ placeholder="Enter your prompt here and press the button",
1143
+ )
1144
+ launch_research_btn = gr.Button("Run", variant="primary")
1145
+
1146
+ # Fallback events
1147
+ text_input.submit(
1148
+ self.log_user_message,
1149
+ [text_input, file_uploads_log],
1150
+ [stored_messages, text_input, launch_research_btn],
1151
+ ).then(
1152
+ self.interact_with_agent,
1153
+ [stored_messages, chatbot],
1154
+ [chatbot, fb_report_markdown, fb_report_dashboard],
1155
+ )
1156
+ launch_research_btn.click(
1157
+ self.log_user_message,
1158
+ [text_input, file_uploads_log],
1159
+ [stored_messages, text_input, launch_research_btn],
1160
+ ).then(
1161
+ self.interact_with_agent,
1162
+ [stored_messages, chatbot],
1163
+ [chatbot, fb_report_markdown, fb_report_dashboard],
1164
+ )
1165
+
1166
+
1167
+ # Configure for Hugging Face Spaces compatibility
1168
+ is_spaces = os.getenv("SPACE_ID") is not None
1169
+
1170
+ if is_spaces:
1171
+ # Hugging Face Spaces configuration
1172
+ demo.launch(
1173
+ debug=False,
1174
+ server_name="0.0.0.0",
1175
+ server_port=int(os.getenv("PORT", 7860)),
1176
+ share=True,
1177
+ **kwargs
1178
+ )
1179
+ else:
1180
+ # Local development configuration
1181
+ demo.launch(
1182
+ debug=True,
1183
+ server_name="localhost",
1184
+ server_port=7860,
1185
+ share=False,
1186
+ **kwargs
1187
+ )
1188
+
1189
+ # Launch the application
1190
+ if __name__ == "__main__":
1191
+ try:
1192
+ GradioUI(file_upload_folder="uploads").launch()
1193
+ except KeyboardInterrupt:
1194
+ print("Application stopped by user")
1195
+ except Exception as e:
1196
+ print(f"Error starting application: {e}")
1197
+ raise
assets/images/.gitkeep ADDED
@@ -0,0 +1,2 @@
1
+
2
+
assets/images/image_logo.png ADDED

Git LFS Details

  • SHA256: 40cff68237947762735e60a031b6c4826c7a3d4b2aff964ca3bc6a9b125e816b
  • Pointer size: 131 Bytes
  • Size of remote file: 750 kB
docs/README.md ADDED
@@ -0,0 +1,32 @@
1
+ # Hugging Research — Documentation
2
+
3
+ Hugging Research is a lightweight research and coding assistant focused on Hugging Face Hub content. It helps you find models, datasets, Spaces, users, collections, and papers, and organizes links into a clean report.
4
+
5
+ - UI: Gradio Blocks
6
+ - Agent: smolagents `CodeAgent`
7
+ - Tools: `scripts/hf_tools.py` (anonymous, read‑only)
8
+ - Report: server-side generated from the final answer (no model call required)
9
+
10
+ ## Contents
11
+ - [Getting started](./getting-started.md)
12
+ - [Architecture](./architecture.md)
13
+ - [Tools](./tools.md)
14
+ - [Security](./security.md)
15
+ - [Troubleshooting](./troubleshooting.md)
16
+ - [Contributing](./contributing.md)
17
+ - [Roadmap](./roadmap.md)
18
+
19
+ ## What it does
20
+ - Searches Hugging Face Hub (models/datasets/Spaces/papers/users/collections)
21
+ - Pulls tutorials/blog/course content via domain‑restricted search when needed
22
+ - Avoids hallucinated links: only cites URLs from tool outputs
23
+ - Builds an HTML report of links in the Report view automatically
24
+
25
+ ## How it works (quick view)
26
+ - The agent uses `hf_*` tools (return JSON as strings) and `web_search` (returns plain text)
27
+ - The app converts the final answer into a categorized link report
28
+ - No files are written to disk for reports; HTML is rendered in‑app
29
+
30
+ Start here: [Getting started](./getting-started.md)
31
+
32
+
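
The "how it works" notes above mention that the final answer is turned into a categorized link report. As a rough illustration only (the real logic lives in `scripts/report_generator.py` and may differ), extracting and grouping Hub links from a final answer can be sketched like this; the regex, category rule, and sample answer are illustrative assumptions, not the app's actual code.

```python
import re

# Illustrative only: group Hub links found in a final answer by their URL prefix.
final_answer = (
    "Try https://huggingface.co/distilbert-base-uncased for classification, "
    "https://huggingface.co/datasets/imdb as data, and "
    "https://huggingface.co/spaces/evaluate-metric/accuracy to compare runs."
)

links = re.findall(r"https://huggingface\.co/\S+", final_answer)
categories = {"models": [], "datasets": [], "spaces": []}
for url in links:
    url = url.rstrip(".,)")  # drop trailing punctuation picked up by the regex
    if "/datasets/" in url:
        categories["datasets"].append(url)
    elif "/spaces/" in url:
        categories["spaces"].append(url)
    else:
        categories["models"].append(url)

for name, urls in categories.items():
    print(name, urls)
```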
docs/architecture.md ADDED
@@ -0,0 +1,28 @@
1
+ # Architecture
2
+
3
+ ## Overview
4
+ Gradio UI with a single Report view. A smolagents `CodeAgent` uses `hf_*` tools to fetch public Hub data, then the app turns the final answer into a categorized HTML link report.
5
+
6
+ ## Components
7
+ - `app.py`
8
+ - Session and API key setup (for inference model only)
9
+ - Streams agent messages
10
+ - Generates report HTML from the final answer (no file writes)
11
+ - `scripts/hf_tools.py`
12
+ - Anonymous, read‑only wrappers of Hub APIs and domain‑restricted search
13
+ - Outputs JSON as strings
14
+ - `scripts/report_generator.py`
15
+ - Parses links in the final answer and renders a self‑contained HTML report
16
+
17
+ ## Flow
18
+ 1. User sends a prompt
19
+ 2. Agent calls `hf_*` tools and composes an answer with inline links
20
+ 3. App converts that answer into an HTML link report and shows it in the Report view
21
+
22
+ ## Privacy
23
+ - Tools never use tokens; gated/private items are marked as not accessible
24
+ - `HF_TOKEN` is only for the inference model
25
+
26
+ ## Extending
27
+ - Add tools in `scripts/hf_tools.py` and register in `create_tools_with_model`
28
+ - Update the system prompt to document tool contracts (a sketch of such a tool follows below)
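
To make the "Extending" notes concrete, here is a minimal sketch of a new read-only tool written in the same style as the existing `hf_*` tools in this commit (a `smolagents` `Tool` subclass with `name`, `description`, `inputs`, `output_type`, and a `forward` that returns JSON as a string). The tool name and the Hub endpoint wrapper are illustrative; only `create_tools_with_model` mentioned above is the real registration hook.

```python
import json

import requests
from smolagents import Tool


class HFOwnerModelsTool(Tool):
    # Hypothetical example tool, modeled on the classes in scripts/hf_tools.py.
    name = "hf_owner_models"
    description = "List public models for a given owner. Returns JSON with `results`."
    inputs = {
        "owner": {"type": "string", "description": "Owner/namespace to list models for"},
        "limit": {"type": "number", "description": "Max results", "nullable": True},
    }
    output_type = "string"

    def forward(self, owner: str, limit: int = 5) -> str:
        resp = requests.get(
            "https://huggingface.co/api/models",
            params={"author": owner, "limit": int(limit)},
            headers={"Accept": "application/json"},  # anonymous on purpose: no Authorization header
            timeout=15,
        )
        if resp.status_code in (401, 403):
            return json.dumps({"results": [], "status": resp.status_code, "error": "no_access"})
        resp.raise_for_status()
        results = [
            {"id": m.get("id") or m.get("modelId"),
             "url": f"https://huggingface.co/{m.get('id') or m.get('modelId')}"}
            for m in resp.json()
        ]
        return json.dumps({"results": results, "status": 200, "error": ""})


# Quick manual check outside the agent:
print(HFOwnerModelsTool().forward("huggingface", limit=3))
```

Registering it would mean appending an instance in `create_tools_with_model` and documenting its contract in the system prompt, as the bullets above suggest.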
docs/contributing.md ADDED
@@ -0,0 +1,28 @@
1
+ # Contributing
2
+
3
+ ## Branches and PRs
4
+ - Branch from `main` using `feature/...` or `fix/...` prefixes
5
+ - Keep PRs focused and small; include screenshots for UI changes
6
+ - Link related issues if applicable
7
+
8
+ ## Code style
9
+ - Python 3.10+
10
+ - Descriptive names; functions as verbs, variables as nouns
11
+ - Handle errors explicitly; avoid silent excepts
12
+ - Keep indentation and formatting consistent with the repo
13
+
14
+ ## Commit messages
15
+ - Prefix: `feat:`, `fix:`, `docs:`, `refactor:`, `chore:`
16
+ - Keep them concise and meaningful
17
+
18
+ ## Tests / Manual checks
19
+ - Run the app and verify:
20
+ - Desktop and mobile UIs load
21
+ - Single Report view renders the link report from the last answer
22
+ - HF tools return JSON strings and handle errors
23
+
24
+ ## PR checklist
25
+ - [ ] Code builds and runs locally
26
+ - [ ] No secrets or tokens committed
27
+ - [ ] Updated docs if behavior/UI changed
28
+ - [ ] Included screenshots for UI updates
docs/getting-started.md ADDED
@@ -0,0 +1,34 @@
1
+ # Getting started
2
+
3
+ ## Requirements
4
+ - Python 3.10+
5
+ - Internet connection
6
+
7
+ ## Install
8
+ ```bash
9
+ git clone https://github.com/mcdaqc/hugging-research
10
+ cd hugging-research
11
+ python -m venv venv
12
+ # Windows
13
+ venv\Scripts\activate
14
+ # macOS/Linux
15
+ # source venv/bin/activate
16
+ pip install -r requirements.txt
17
+ ```
18
+
19
+ ## Configure
20
+ Create `.env` (token is only for the inference model; tools are anonymous):
21
+ ```ini
22
+ HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
23
+ MODEL_ID=Qwen/Qwen3-Coder-480B-A35B-Instruct
24
+ ```
25
+
26
+ ## Run
27
+ ```bash
28
+ python app.py
29
+ # open http://localhost:7860
30
+ ```
31
+
32
+ ## Notes for Spaces
33
+ - Set `HF_TOKEN` as a Space Secret if you want to choose a different inference model via `MODEL_ID`.
34
+ - Tools never use tokens; private/gated items will be marked as not accessible.
docs/roadmap.md ADDED
@@ -0,0 +1,7 @@
1
+ # Roadmap
2
+
3
+ - Report enhancements: optional filters, compact view
4
+ - Optional client-side export (PNG/PDF), no server deps
5
+ - Caching for Hub calls with short TTL
6
+ - Broader tests for tools and UI
7
+ - Split `hf_tools.py` into domain modules
docs/security.md ADDED
@@ -0,0 +1,15 @@
1
+ # Security & privacy
2
+
3
+ ## Principles
4
+ - Read‑only: tools never send Authorization headers (illustrated by the sketch at the end of this page)
5
+ - Respect gated/private resources and label them as not accessible
6
+ - Don’t log secrets; `HF_TOKEN` is only for the inference model
7
+
8
+ ## Details
9
+ - Tools normalize `visibility` and `access` fields
10
+ - The Report view renders HTML in memory; no report files are saved
11
+
12
+
13
+ ## Scope
14
+ - No write operations to the Hub
15
+ - Only public endpoints and domain‑restricted search are used
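
As an illustration of the read-only principle above (a sketch, not the project's exact helper), a public Hub API call is made without any Authorization header, and a 401/403 response is simply reported as `no_access`:

```python
import requests

resp = requests.get(
    "https://huggingface.co/api/models",
    params={"search": "sentiment", "limit": 3},
    headers={"Accept": "application/json"},  # deliberately no Authorization header
    timeout=15,
)

if resp.status_code in (401, 403):
    print("access = no_access (gated or private resource)")
else:
    resp.raise_for_status()
    for model in resp.json():
        print(model.get("id") or model.get("modelId"))
```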
docs/tools.md ADDED
@@ -0,0 +1,31 @@
1
+ # Tools
2
+
3
+ All `hf_*` tools are anonymous, read‑only, and return JSON as a string.
4
+ - Listing tools → `{ "results": [...] }`
5
+ - Detail tools → `{ "item": {...} }`
6
+
7
+ Common metadata: `visibility`, `access`, `type`, `id`, `owner`, `url`, `likes`, `downloads`, `updatedAt`.
8
+
9
+ ## Searches
10
+ - `hf_models_search(query, owner?, task?, tags?, sort?, direction?, limit=10)`
11
+ - `hf_datasets_search(query, owner?, tags?, sort?, direction?, limit=10)`
12
+ - `hf_spaces_search(query, owner?, tags?, sort?, direction?, limit=10)`
13
+ - `hf_site_search(query, limit=10)` — Blog/Learn/Docs discovery (plain links)
14
+
15
+ ## Details
16
+ - `hf_model_info(repo_id)`
17
+ - `hf_dataset_info(repo_id)`
18
+ - `hf_space_info(repo_id)`
19
+ - `hf_user_info(username)`
20
+ - `hf_collections_list(owner)` / `hf_collection_get(owner, slug)`
21
+ - `hf_paper_info(arxiv_id)` / `hf_paper_repos(arxiv_id)` / `hf_daily_papers(date?)`
22
+ - `hf_repo_info(repo_type, repo_id)`
23
+
24
+ ## Parsing rules (important)
25
+ - `web_search` → returns plain text. Do not `json.loads`.
26
+ - `hf_*` tools → return JSON as a string. Always `json.loads(...)` before indexing (see the sketch at the end of this file).
27
+
28
+ ## Examples
29
+ - “Find top Stable Diffusion models.”
30
+ - “Datasets for Spanish sentiment analysis.”
31
+ - “Spaces for document Q&A.”
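
A minimal sketch of the parsing rules above, with the tool outputs mocked as literal strings so the example runs standalone (inside the agent the strings come from the tools themselves):

```python
import json

# hf_* listing tools return JSON as a string with a `results` key: decode before indexing.
hf_output = '{"results": [{"id": "distilbert-base-uncased", "url": "https://huggingface.co/distilbert-base-uncased"}]}'
data = json.loads(hf_output)
for item in data["results"]:
    print(item["id"], "->", item["url"])

# web_search returns plain text: read it directly, never json.loads it.
web_output = "1. Fine-tuning guide - https://huggingface.co/learn\n2. ..."
print(web_output.splitlines()[0])
```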
docs/troubleshooting.md ADDED
@@ -0,0 +1,17 @@
1
+ # Troubleshooting
2
+
3
+ ## UI/Queue errors (fn_index KeyError)
4
+ - After code changes, do a hard refresh (Ctrl+F5) or restart the server. Stale frontends can reference removed event handlers.
5
+
6
+ ## Tool output parsing
7
+ - `web_search` returns plain text. Do not `json.loads` or index it.
8
+ - `hf_*` tools return JSON as strings. Always `json.loads` before indexing.
9
+
10
+ ## Rate limits
11
+ - DuckDuckGo/site search can rate limit. Reduce `limit`, vary queries, retry later.
12
+
13
+ ## Windows Unicode
14
+ - If the console raises errors on Unicode output, switch to a UTF‑8 code page or encode the output explicitly (see the sketch at the end of this file).
15
+
16
+ ## Access errors
17
+ - 401/403 is expected for gated/private. Tools should mark `access=no_access`.
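
For the Windows Unicode note above, one workaround (a sketch; setting `PYTHONIOENCODING=utf-8` or running `chcp 65001` before launching works as well) is to re-wrap stdout so unencodable characters are replaced instead of raising:

```python
import sys

# Replace characters the console cannot encode instead of raising UnicodeEncodeError.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
print("Unicode check: ✅ emoji and accented text print without crashing")
```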
requirements.txt ADDED
@@ -0,0 +1,132 @@
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.15
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ anthropic==0.54.0
7
+ anyio==4.10.0
8
+ async-timeout==5.0.1
9
+ attrs==25.3.0
10
+ beautifulsoup4==4.13.4
11
+ biopython==1.85
12
+ Brotli==1.1.0
13
+ certifi==2025.8.3
14
+ cffi==1.17.1
15
+ charset-normalizer==3.4.3
16
+ chess==1.11.2
17
+ click==8.2.1
18
+ cobble==0.1.4
19
+ colorama==0.4.6
20
+ cryptography==45.0.6
21
+ datasets==3.6.0
22
+ ddgs==9.5.4
23
+ defusedxml==0.7.1
24
+ dill==0.3.8
25
+ distro==1.9.0
26
+ duckduckgo_search==8.0.4
27
+ et_xmlfile==2.0.0
28
+ exceptiongroup==1.3.0
29
+ fastapi==0.116.1
30
+ ffmpy==0.6.1
31
+ filelock==3.19.1
32
+ frozenlist==1.7.0
33
+ fsspec==2025.3.0
34
+ google_search_results==2.4.2
35
+ gradio==5.42.0
36
+ gradio_client==1.11.1
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ httpx-sse==0.4.1
42
+ huggingface-hub==0.34.4
43
+ idna==3.10
44
+ Jinja2==3.1.6
45
+ jiter==0.10.0
46
+ joblib==1.5.1
47
+ loguru==0.7.3
48
+ lxml==6.0.0
49
+ mammoth==1.9.1
50
+ markdown-it-py==4.0.0
51
+ markdownify==1.1.0
52
+ MarkupSafe==3.0.2
53
+ mcp==1.9.3
54
+ mdurl==0.1.2
55
+ mpmath==1.3.0
56
+ multidict==6.6.4
57
+ multiprocess==0.70.16
58
+ narwhals==2.1.2
59
+ networkx==3.4.2
60
+ numexpr==2.11.0
61
+ numpy==2.2.6
62
+ openai==1.87.0
63
+ openpyxl==3.1.5
64
+ orjson==3.11.2
65
+ packaging==25.0
66
+ pandas==2.3.0
67
+ pathvalidate==3.3.1
68
+ pdfminer==20191125
69
+ pdfminer.six==20250506
70
+ pillow==11.2.1
71
+ plotly==6.1.2
72
+ primp==0.15.0
73
+ propcache==0.3.2
74
+ PubChemPy==1.0.4
75
+ puremagic==1.29
76
+ pyarrow==21.0.0
77
+ pycparser==2.22
78
+ pycryptodome==3.23.0
79
+ pydantic==2.11.7
80
+ pydantic-settings==2.10.1
81
+ pydantic_core==2.33.2
82
+ pydub==0.25.1
83
+ Pygments==2.19.2
84
+ pypdf==5.6.0
85
+ PyPDF2==3.0.1
86
+ python-dateutil==2.9.0.post0
87
+ python-dotenv==1.1.0
88
+ python-multipart==0.0.20
89
+ python-pptx==1.0.2
90
+ pytz==2025.2
91
+ PyYAML==6.0.2
92
+ regex==2025.7.34
93
+ requests==2.32.4
94
+ rich==14.1.0
95
+ ruff==0.12.9
96
+ safehttpx==0.1.6
97
+ safetensors==0.6.2
98
+ scikit-learn==1.7.0
99
+ scipy==1.15.3
100
+ semantic-version==2.10.0
101
+ serpapi==0.1.5
102
+ shellingham==1.5.4
103
+ simplejson==3.20.1
104
+ six==1.17.0
105
+ smolagents==1.21.1
106
+ sniffio==1.3.1
107
+ soupsieve==2.7
108
+ SpeechRecognition==3.14.3
109
+ sse-starlette==3.0.2
110
+ starlette==0.47.2
111
+ sympy==1.14.0
112
+ threadpoolctl==3.6.0
113
+ tokenizers==0.21.4
114
+ tomlkit==0.13.3
115
+ torch==2.7.1
116
+ torchvision==0.22.1
117
+ tqdm==4.67.1
118
+ transformers==4.52.4
119
+ typer==0.16.1
120
+ typing-inspection==0.4.1
121
+ typing_extensions==4.14.1
122
+ tzdata==2025.2
123
+ urllib3==2.5.0
124
+ uvicorn==0.35.0
125
+ websockets==15.0.1
126
+ win32_setctime==1.2.0
127
+ xlrd==2.0.2
128
+ xlsxwriter==3.2.5
129
+ xxhash==3.5.0
130
+ yahoo-finance==1.4.0
131
+ yarl==1.20.1
132
+ youtube-transcript-api==1.1.0
runtime.txt ADDED
@@ -0,0 +1 @@
1
+ python-3.10.0
scripts/__pycache__/cookies.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
scripts/__pycache__/cvedb_tool.cpython-310.pyc ADDED
Binary file (7.29 kB). View file
 
scripts/__pycache__/epss_tool.cpython-310.pyc ADDED
Binary file (4.02 kB). View file
 
scripts/__pycache__/hf_tools.cpython-310.pyc ADDED
Binary file (29 kB). View file
 
scripts/__pycache__/kevin_tool.cpython-310.pyc ADDED
Binary file (8.45 kB). View file
 
scripts/__pycache__/mdconvert.cpython-310.pyc ADDED
Binary file (25.6 kB). View file
 
scripts/__pycache__/nvd_tool.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
scripts/__pycache__/report_generator.cpython-310.pyc ADDED
Binary file (7.71 kB). View file
 
scripts/__pycache__/text_inspector_tool.cpython-310.pyc ADDED
Binary file (3.24 kB). View file
 
scripts/__pycache__/text_web_browser.cpython-310.pyc ADDED
Binary file (17.6 kB). View file
 
scripts/__pycache__/visual_qa.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
scripts/cookies.py ADDED
@@ -0,0 +1,715 @@
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
+ # Create a RequestsCookieJar instance
711
+ COOKIES = RequestsCookieJar()
712
+
713
+ # Add cookies to the jar
714
+ for cookie in COOKIES_LIST:
715
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
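
A minimal usage sketch for the jar built above, assuming the repository root is on `sys.path` (the target URL here is illustrative; in the app the jar is meant to be reused by the browsing helpers):

```python
import requests

from scripts.cookies import COOKIES  # the RequestsCookieJar assembled at the bottom of cookies.py

session = requests.Session()
session.cookies = COOKIES  # reuse the prebuilt jar for subsequent requests

resp = session.get("https://web.archive.org/", timeout=15)
print(resp.status_code, "->", len(resp.content), "bytes")
```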
scripts/gaia_scorer.py ADDED
@@ -0,0 +1,124 @@
1
+ import re
2
+ import string
3
+ import warnings
4
+
5
+
6
+ def normalize_number_str(number_str: str) -> float:
7
+ # we replace these common units and commas to allow
8
+ # conversion to float
9
+ for char in ["$", "%", ","]:
10
+ number_str = number_str.replace(char, "")
11
+ try:
12
+ return float(number_str)
13
+ except ValueError:
14
+ print(f"String {number_str} cannot be normalized to number str.")
15
+ return float("inf")
16
+
17
+
18
+ def split_string(
19
+ s: str,
20
+ char_list: list[str] = [",", ";"],
21
+ ) -> list[str]:
22
+ pattern = f"[{''.join(char_list)}]"
23
+ return re.split(pattern, s)
24
+
25
+
26
+ def is_float(element: any) -> bool:
27
+ try:
28
+ float(element)
29
+ return True
30
+ except ValueError:
31
+ return False
32
+
33
+
34
+ def question_scorer(
35
+ model_answer: str,
36
+ ground_truth: str,
37
+ ) -> bool:
38
+ # if gt is a number
39
+ if is_float(ground_truth):
40
+ normalized_answer = normalize_number_str(str(model_answer))
41
+ return normalized_answer == float(ground_truth)
42
+
43
+ # if gt is a list
44
+ elif any(char in ground_truth for char in [",", ";"]):
45
+ # question with the fish: normalization removes punct
46
+
47
+ gt_elems = split_string(ground_truth)
48
+ ma_elems = split_string(model_answer)
49
+
50
+ # check length is the same
51
+ if len(gt_elems) != len(ma_elems):
52
+ warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
53
+ return False
54
+
55
+ # compare each element as float or str
56
+ comparisons = []
57
+ for ma_elem, gt_elem in zip(ma_elems, gt_elems):
58
+ if is_float(gt_elem):
59
+ normalized_ma_elem = normalize_number_str(ma_elem)
60
+ comparisons.append(normalized_ma_elem == float(gt_elem))
61
+ else:
62
+ # we do not remove punct since comparisons can include punct
63
+ comparisons.append(
64
+ normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
65
+ )
66
+ return all(comparisons)
67
+
68
+ # if gt is a str
69
+ else:
70
+ return normalize_str(model_answer) == normalize_str(ground_truth)
71
+
72
+
73
+ def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
74
+ prediction = prediction.lower()
75
+ true_answer = true_answer.lower()
76
+ if len(prediction) > len(true_answer) * 3:
77
+ return False
78
+ i = 0
79
+ for letter in true_answer:
80
+ if letter in prediction[i:]:
81
+ i += prediction[i:].index(letter)
82
+ else:
83
+ return False
84
+ return True
85
+
86
+
87
+ def check_close_call(prediction, true_answer, is_correct):
88
+ if is_correct:
89
+ return True
90
+ else:
91
+ if is_float(true_answer):
92
+ return is_correct
93
+ else:
94
+ if (
95
+ check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
96
+ and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
97
+ ):
98
+ print(f"Close call: {prediction} vs {true_answer}")
99
+ return True
100
+ else:
101
+ return False
102
+
103
+
104
+ def normalize_str(input_str, remove_punct=True) -> str:
105
+ """
106
+ Normalize a string by:
107
+ - Removing all white spaces
108
+ - Optionally removing punctuation (if remove_punct is True)
109
+ - Converting to lowercase
110
+ Parameters:
111
+ - input_str: str, the string to normalize
112
+ - remove_punct: bool, whether to remove punctuation (default: True)
113
+ Returns:
114
+ - str, the normalized string
115
+ """
116
+ # Remove all white spaces. Required e.g for seagull vs. sea gull
117
+ no_spaces = re.sub(r"\s", "", input_str)
118
+
119
+ # Remove punctuation, if specified.
120
+ if remove_punct:
121
+ translator = str.maketrans("", "", string.punctuation)
122
+ return no_spaces.lower().translate(translator)
123
+ else:
124
+ return no_spaces.lower()
scripts/hf_tools.py ADDED
@@ -0,0 +1,867 @@
1
+ import json
2
+ import time
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import requests
6
+ from smolagents import Tool
7
+
8
+
9
+ # -----------------------------
10
+ # HTTP helpers (anonymous only)
11
+ # -----------------------------
12
+
13
+ DEFAULT_TIMEOUT = 15
14
+ RETRY_STATUS = {429, 500, 502, 503, 504}
15
+
16
+
17
+ def _anonymous_headers() -> Dict[str, str]:
18
+ return {
19
+ "Accept": "application/json",
20
+ "User-Agent": "HuggingResearch-Agent/1.0 (+https://huggingface.co)",
21
+ # No Authorization header on purpose (public only)
22
+ }
23
+
24
+
25
+ def _http_get_json(url: str, params: Optional[Dict] = None, max_retries: int = 2) -> Tuple[Optional[Dict | List], int, str]:
26
+ last_err = ""
27
+ for attempt in range(max_retries + 1):
28
+ try:
29
+ resp = requests.get(url, params=params or {}, headers=_anonymous_headers(), timeout=DEFAULT_TIMEOUT)
30
+ if resp.status_code == 200:
31
+ try:
32
+ return resp.json(), resp.status_code, ""
33
+ except Exception as je:
34
+ return None, resp.status_code, f"invalid_json: {je}"
35
+ if resp.status_code in {401, 403}:
36
+ # Private/Gated/Unauthorized
37
+ return None, resp.status_code, "no_access"
38
+ if resp.status_code in RETRY_STATUS and attempt < max_retries:
39
+ time.sleep(1.0 * (attempt + 1))
40
+ continue
41
+ return None, resp.status_code, f"http_{resp.status_code}"
42
+ except Exception as e:
43
+ last_err = str(e)
44
+ if attempt < max_retries:
45
+ time.sleep(0.8 * (attempt + 1))
46
+ continue
47
+ return None, 0, f"exception: {last_err}"
48
+
49
+
50
+ # -----------------------------
51
+ # Normalization helpers
52
+ # -----------------------------
53
+
54
+ def _visibility_from_item(item: Dict) -> Tuple[str, str]:
55
+ if not isinstance(item, dict):
56
+ return "public", "accessible"
57
+ if item.get("private") is True:
58
+ return "private", "no_access"
59
+ if item.get("gated") is True or item.get("gatedReason") or (isinstance(item.get("cardData"), dict) and item["cardData"].get("gated")):
60
+ return "gated", "no_access"
61
+ return "public", "accessible"
62
+
63
+
64
+ def _norm_common(item_id: str, item_type: str, owner: str, description: str = "", url_suffix: str = "") -> Dict:
65
+ url = f"https://huggingface.co/{url_suffix}{item_id}" if url_suffix else f"https://huggingface.co/{item_id}"
66
+ return {
67
+ "type": item_type,
68
+ "id": item_id,
69
+ "owner": owner,
70
+ "url": url,
71
+ "description": description or "",
72
+ }
73
+
74
+
75
+ def _safe_get(item: Dict, key: str, default=None):
76
+ return item.get(key, default) if isinstance(item, dict) else default
77
+
78
+
79
+ # -----------------------------
80
+ # Tools
81
+ # -----------------------------
82
+
83
+
84
+ class HFModelsSearchTool(Tool):
85
+ name = "hf_models_search"
86
+ description = (
87
+ "Search public Hugging Face models. Provide a free-text query and optional filters "
88
+ "(owner, single pipeline_tag, tags CSV, sort/direction, limit). "
89
+ "Prefer minimal params; add owner/task/tags/sort only when the user implies them. "
90
+ "Defaults: limit=10, sort omitted, direction omitted. Returns JSON with `results`, `status`, `error`, and `params`."
91
+ )
92
+ inputs = {
93
+ "query": {"type": "string", "description": "Free-text search", "nullable": True},
94
+ "owner": {"type": "string", "description": "Filter by owner/namespace", "nullable": True},
95
+ "task": {"type": "string", "description": "Primary pipeline tag, e.g. text-classification", "nullable": True},
96
+ "tags": {"type": "string", "description": "Comma-separated tags filter", "nullable": True},
97
+ "sort": {"type": "string", "description": "downloads|likes|modified", "nullable": True},
98
+ "direction": {"type": "string", "description": "descending|ascending", "nullable": True},
99
+ "limit": {"type": "number", "description": "Max results", "nullable": True},
100
+ }
101
+ output_type = "string"
102
+
103
+ def forward(self, query: Optional[str] = None, owner: Optional[str] = None, task: Optional[str] = None, tags: Optional[str] = None, sort: Optional[str] = None, direction: Optional[str] = None, limit: Optional[int] = None) -> str:
104
+ # Build conservative params
105
+ params = {}
106
+ if query:
107
+ params["search"] = query
108
+ if owner:
109
+ params["author"] = owner
110
+ if task:
111
+ # pipeline_tag must be a single value; if multiple provided, take the first
112
+ first_task = task.split(",")[0].strip()
113
+ if first_task:
114
+ params["pipeline_tag"] = first_task
115
+ if tags:
116
+ # Support comma-separated → repeated tags
117
+ tag_list = [t.strip() for t in tags.split(",") if t.strip()]
118
+ if len(tag_list) == 1:
119
+ params["tags"] = tag_list[0]
120
+ elif len(tag_list) > 1:
121
+ params["tags"] = tag_list # requests will repeat param
122
+ # Support 'trending' as an alias mapped to downloads+descending for recency/interest
123
+ if sort in {"downloads", "likes", "modified", "trending"}:
124
+ params["sort"] = sort
125
+ if sort == "trending":
126
+ params["sort"] = "downloads"
127
+ params["direction"] = "descending"
128
+ elif direction in {"descending", "ascending"}:
129
+ params["direction"] = direction
130
+ # Default limit to 10 if not specified
131
+ lim = int(limit) if limit else 10
132
+ params["limit"] = lim
133
+
134
+ data, status, err = _http_get_json("https://huggingface.co/api/models", params)
135
+ # Fallback: retry with minimal params if 400
136
+ if status == 400:
137
+ minimal = {"search": query} if query else {}
138
+ if limit:
139
+ minimal["limit"] = int(limit)
140
+ data, status, err = _http_get_json("https://huggingface.co/api/models", minimal)
141
+ results: List[Dict] = []
142
+ if isinstance(data, list):
143
+ for it in data:
144
+ model_id = _safe_get(it, "id") or _safe_get(it, "modelId") or ""
145
+ if not model_id:
146
+ continue
147
+ owner_name = model_id.split("/")[0] if "/" in model_id else ""
148
+ desc = ""
149
+ # If present, short description may live in cardData/summary when full=true; not guaranteed in list
150
+ visibility, access = _visibility_from_item(it)
151
+ norm = _norm_common(model_id, "model", owner_name, desc)
152
+ norm.update({
153
+ "tags": _safe_get(it, "tags", []),
154
+ "task": _safe_get(it, "pipeline_tag"),
155
+ "likes": _safe_get(it, "likes", 0),
156
+ "downloads": _safe_get(it, "downloads", 0),
157
+ "updatedAt": _safe_get(it, "lastModified"),
158
+ "visibility": visibility,
159
+ "access": access,
160
+ })
161
+ results.append(norm)
162
+
163
+ return json.dumps({
164
+ "results": results,
165
+ "status": status,
166
+ "error": err,
167
+ "params": params,
168
+ }, ensure_ascii=False)
169
+
170
+
171
+ class HFModelInfoTool(Tool):
172
+ name = "hf_model_info"
173
+ description = (
174
+ "Get detailed public model info by repo id (owner/name). Use this after a model search to fetch richer metadata (cardData, siblings, tags)."
175
+ )
176
+ inputs = {
177
+ "repo_id": {"type": "string", "description": "Model repo id, e.g. bigscience/bloom"}
178
+ }
179
+ output_type = "string"
180
+
181
+ def forward(self, repo_id: str) -> str:
182
+ data, status, err = _http_get_json(f"https://huggingface.co/api/models/{repo_id}", {"full": "true"})
183
+ item: Dict = {}
184
+ if isinstance(data, dict):
185
+ model_id = data.get("id") or data.get("modelId") or repo_id
186
+ owner_name = model_id.split("/")[0] if "/" in model_id else ""
187
+ visibility, access = _visibility_from_item(data)
188
+ desc = ""
189
+ # Some cards put a short summary in cardData/summary
190
+ if isinstance(data.get("cardData"), dict):
191
+ desc = data["cardData"].get("summary") or data["cardData"].get("description") or ""
192
+ item = _norm_common(model_id, "model", owner_name, desc)
193
+ item.update({
194
+ "tags": data.get("tags", []),
195
+ "task": data.get("pipeline_tag"),
196
+ "likes": data.get("likes", 0),
197
+ "downloads": data.get("downloads", 0),
198
+ "updatedAt": data.get("lastModified"),
199
+ "visibility": visibility,
200
+ "access": access,
201
+ "cardData": data.get("cardData"),
202
+ "siblings": data.get("siblings"),
203
+ })
204
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
205
+
206
+
207
+ class HFDatasetsSearchTool(Tool):
208
+ name = "hf_datasets_search"
209
+ description = (
210
+ "Search public datasets with a free-text query and optional filters (owner, tags CSV, sort/direction, limit). "
211
+ "Prefer minimal params; add filters when implied. Defaults: limit=10. Returns JSON with `results`, `status`, `error`, and `params`."
212
+ )
213
+ inputs = {
214
+ "query": {"type": "string", "description": "Free-text search", "nullable": True},
215
+ "owner": {"type": "string", "description": "Filter by owner/namespace", "nullable": True},
216
+ "tags": {"type": "string", "description": "Comma-separated tags filter", "nullable": True},
217
+ "sort": {"type": "string", "description": "downloads|likes|modified", "nullable": True},
218
+ "direction": {"type": "string", "description": "descending|ascending", "nullable": True},
219
+ "limit": {"type": "number", "description": "Max results", "nullable": True},
220
+ }
221
+ output_type = "string"
222
+
223
+ def forward(self, query: Optional[str] = None, owner: Optional[str] = None, tags: Optional[str] = None, sort: Optional[str] = None, direction: Optional[str] = None, limit: Optional[int] = None) -> str:
224
+ params = {}
225
+ if query:
226
+ params["search"] = query
227
+ if owner:
228
+ params["author"] = owner
229
+ if tags:
230
+ tag_list = [t.strip() for t in tags.split(",")] if isinstance(tags, str) else []
231
+ tag_list = [t for t in tag_list if t]
232
+ if len(tag_list) == 1:
233
+ params["tags"] = tag_list[0]
234
+ elif len(tag_list) > 1:
235
+ params["tags"] = tag_list
236
+ if sort in {"downloads", "likes", "modified", "trending"}:
237
+ params["sort"] = sort
238
+ if sort == "trending":
239
+ params["sort"] = "downloads"
240
+ params["direction"] = "descending"
241
+ elif direction in {"descending", "ascending"}:
242
+ params["direction"] = direction
243
+ lim = int(limit) if limit else 10
244
+ params["limit"] = lim
245
+
246
+ data, status, err = _http_get_json("https://huggingface.co/api/datasets", params)
247
+ if status == 400:
248
+ minimal = {"search": query} if query else {}
249
+ if limit:
250
+ minimal["limit"] = int(limit)
251
+ data, status, err = _http_get_json("https://huggingface.co/api/datasets", minimal)
252
+ results: List[Dict] = []
253
+ if isinstance(data, list):
254
+ for it in data:
255
+ ds_id = _safe_get(it, "id") or _safe_get(it, "datasetId") or ""
256
+ if not ds_id:
257
+ continue
258
+ owner_name = ds_id.split("/")[0] if "/" in ds_id else ""
259
+ visibility, access = _visibility_from_item(it)
260
+ norm = _norm_common(ds_id, "dataset", owner_name, "")
261
+ norm.update({
262
+ "tags": _safe_get(it, "tags", []),
263
+ "likes": _safe_get(it, "likes", 0),
264
+ "downloads": _safe_get(it, "downloads", 0),
265
+ "updatedAt": _safe_get(it, "lastModified"),
266
+ "visibility": visibility,
267
+ "access": access,
268
+ })
269
+ results.append(norm)
270
+ return json.dumps({"results": results, "status": status, "error": err, "params": params}, ensure_ascii=False)
271
+
272
+
273
+ class HFDatasetInfoTool(Tool):
274
+ name = "hf_dataset_info"
275
+ description = (
276
+ "Get detailed public dataset info by repo id (owner/name). Use after a dataset search to retrieve cardData and siblings."
277
+ )
278
+ inputs = {"repo_id": {"type": "string", "description": "Dataset repo id, e.g. glue"}}
279
+ output_type = "string"
280
+
281
+ def forward(self, repo_id: str) -> str:
282
+ data, status, err = _http_get_json(f"https://huggingface.co/api/datasets/{repo_id}", {"full": "true"})
283
+ item: Dict = {}
284
+ if isinstance(data, dict):
285
+ ds_id = data.get("id") or data.get("datasetId") or repo_id
286
+ owner_name = ds_id.split("/")[0] if "/" in ds_id else ""
287
+ visibility, access = _visibility_from_item(data)
288
+ desc = ""
289
+ if isinstance(data.get("cardData"), dict):
290
+ desc = data["cardData"].get("summary") or data["cardData"].get("description") or ""
291
+ item = _norm_common(ds_id, "dataset", owner_name, desc)
292
+ item.update({
293
+ "tags": data.get("tags", []),
294
+ "likes": data.get("likes", 0),
295
+ "downloads": data.get("downloads", 0),
296
+ "updatedAt": data.get("lastModified"),
297
+ "visibility": visibility,
298
+ "access": access,
299
+ "cardData": data.get("cardData"),
300
+ "siblings": data.get("siblings"),
301
+ })
302
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
303
+
304
+
305
+ class HFSpacesSearchTool(Tool):
306
+ name = "hf_spaces_search"
307
+ description = (
308
+ "Search public Spaces with query and optional filters (owner, tags CSV, sort/direction, limit). "
309
+ "Good for tutorials/demos related to a topic. Defaults: limit=10. Returns JSON with `results`, `status`, `error`, and `params`."
310
+ )
311
+ inputs = {
312
+ "query": {"type": "string", "description": "Free-text search", "nullable": True},
313
+ "owner": {"type": "string", "description": "Filter by owner/namespace", "nullable": True},
314
+ "tags": {"type": "string", "description": "Comma-separated tags filter", "nullable": True},
315
+ "sort": {"type": "string", "description": "likes|modified", "nullable": True},
316
+ "direction": {"type": "string", "description": "descending|ascending", "nullable": True},
317
+ "limit": {"type": "number", "description": "Max results", "nullable": True},
318
+ }
319
+ output_type = "string"
320
+
321
+ def forward(self, query: Optional[str] = None, owner: Optional[str] = None, tags: Optional[str] = None, sort: Optional[str] = None, direction: Optional[str] = None, limit: Optional[int] = None) -> str:
322
+ params = {}
323
+ if query:
324
+ params["search"] = query
325
+ if owner:
326
+ params["author"] = owner
327
+ if tags:
328
+ tag_list = [t.strip() for t in tags.split(",")] if isinstance(tags, str) else []
329
+ tag_list = [t for t in tag_list if t]
330
+ if len(tag_list) == 1:
331
+ params["tags"] = tag_list[0]
332
+ elif len(tag_list) > 1:
333
+ params["tags"] = tag_list
334
+ if sort in {"likes", "modified", "trending"}:
335
+ params["sort"] = sort
336
+ if sort == "trending":
337
+ params["sort"] = "likes"
338
+ params["direction"] = "descending"
339
+ elif direction in {"descending", "ascending"}:
340
+ params["direction"] = direction
341
+ lim = int(limit) if limit else 10
342
+ params["limit"] = lim
343
+
344
+ data, status, err = _http_get_json("https://huggingface.co/api/spaces", params)
345
+ if status == 400:
346
+ minimal = {"search": query} if query else {}
347
+ if limit:
348
+ minimal["limit"] = int(limit)
349
+ data, status, err = _http_get_json("https://huggingface.co/api/spaces", minimal)
350
+ results: List[Dict] = []
351
+ if isinstance(data, list):
352
+ for it in data:
353
+ sp_id = _safe_get(it, "id") or _safe_get(it, "spaceId") or ""
354
+ if not sp_id:
355
+ continue
356
+ owner_name = sp_id.split("/")[0] if "/" in sp_id else ""
357
+ visibility, access = _visibility_from_item(it)
358
+ norm = _norm_common(sp_id, "space", owner_name, "")
359
+ # Try to extract Space runtime (sdk, app file) when available in list
360
+ norm.update({
361
+ "tags": _safe_get(it, "tags", []),
362
+ "likes": _safe_get(it, "likes", 0),
363
+ "downloads": _safe_get(it, "downloads", 0),
364
+ "updatedAt": _safe_get(it, "lastModified"),
365
+ "visibility": visibility,
366
+ "access": access,
367
+ })
368
+ results.append(norm)
369
+ return json.dumps({"results": results, "status": status, "error": err, "params": params}, ensure_ascii=False)
370
+
371
+
372
+ class HFSpaceInfoTool(Tool):
373
+ name = "hf_space_info"
374
+ description = (
375
+ "Get detailed Space info by repo id (owner/name). Use to inspect tags, likes, and card details after a Space search."
376
+ )
377
+ inputs = {"repo_id": {"type": "string", "description": "Space repo id, e.g. user/space-name"}}
378
+ output_type = "string"
379
+
380
+ def forward(self, repo_id: str) -> str:
381
+ data, status, err = _http_get_json(f"https://huggingface.co/api/spaces/{repo_id}", {"full": "true"})
382
+ item: Dict = {}
383
+ if isinstance(data, dict):
384
+ sp_id = data.get("id") or data.get("spaceId") or repo_id
385
+ owner_name = sp_id.split("/")[0] if "/" in sp_id else ""
386
+ visibility, access = _visibility_from_item(data)
387
+ desc = ""
388
+ if isinstance(data.get("cardData"), dict):
389
+ desc = data["cardData"].get("summary") or data["cardData"].get("description") or ""
390
+ item = _norm_common(sp_id, "space", owner_name, desc)
391
+ item.update({
392
+ "tags": data.get("tags", []),
393
+ "likes": data.get("likes", 0),
394
+ "downloads": data.get("downloads", 0),
395
+ "updatedAt": data.get("lastModified"),
396
+ "visibility": visibility,
397
+ "access": access,
398
+ "cardData": data.get("cardData"),
399
+ "siblings": data.get("siblings"),
400
+ })
401
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
402
+
403
+
404
+ class HFUserInfoTool(Tool):
405
+ name = "hf_user_info"
406
+ description = (
407
+ "Fetch public user/org profile by username. Helpful to scope searches by owner or explore maintainers."
408
+ )
409
+ inputs = {"username": {"type": "string", "description": "User or organization name"}}
410
+ output_type = "string"
411
+
412
+ def forward(self, username: str) -> str:
413
+ data, status, err = _http_get_json(f"https://huggingface.co/api/users/{username}")
414
+ item = data if isinstance(data, dict) else {}
415
+ visibility = "public"
416
+ access = "accessible" if status == 200 else "no_access"
417
+ return json.dumps({"item": item, "status": status, "error": err, "visibility": visibility, "access": access}, ensure_ascii=False)
418
+
419
+
420
+ class HFCollectionsListTool(Tool):
421
+ name = "hf_collections_list"
422
+ description = (
423
+ "List public collections, optionally filtered by owner/namespace. Use to surface curated sets of repos. "
424
+ "Owner may be an object; URL is normalized to https://huggingface.co/collections/{owner_name}/{slug}."
425
+ )
426
+ inputs = {"owner": {"type": "string", "description": "Filter by collection owner/namespace", "nullable": True}}
427
+ output_type = "string"
428
+
429
+ def forward(self, owner: Optional[str] = None) -> str:
430
+ params = {}
431
+ if owner:
432
+ params["owner"] = owner
433
+ data, status, err = _http_get_json("https://huggingface.co/api/collections", params)
434
+ results = data if isinstance(data, list) else []
435
+ # Normalize minimally
436
+ items: List[Dict] = []
437
+ for it in results:
438
+ cid = _safe_get(it, "id") or _safe_get(it, "slug") or ""
439
+ ns_val = _safe_get(it, "owner") or _safe_get(it, "namespace") or ""
440
+ if isinstance(ns_val, dict):
441
+ ns = ns_val.get("name") or ns_val.get("fullname") or ""
442
+ else:
443
+ ns = ns_val
444
+ url = ""
445
+ if ns and cid:
446
+ # Some APIs return id as "{namespace}/{slug}", so extract slug part only
447
+ slug = cid.split("/")[-1]
448
+ url = f"https://huggingface.co/collections/{ns}/{slug}"
449
+ items.append({
450
+ "type": "collection",
451
+ "id": cid,
452
+ "owner": ns,
453
+ "title": _safe_get(it, "title", ""),
454
+ "url": url,
455
+ "visibility": "public",
456
+ "access": "accessible",
457
+ })
458
+ return json.dumps({"results": items, "status": status, "error": err}, ensure_ascii=False)
459
+
460
+
461
+ class HFCollectionGetTool(Tool):
462
+ name = "hf_collection_get"
463
+ description = (
464
+ "Get collection details by namespace and slug id (as in URL). Use after listing to inspect items."
465
+ )
466
+ inputs = {
467
+ "namespace": {"type": "string", "description": "Collection owner/namespace"},
468
+ "slug_id": {"type": "string", "description": "slug-id part as shown in URL"},
469
+ }
470
+ output_type = "string"
471
+
472
+ def forward(self, namespace: str, slug_id: str) -> str:
473
+ data, status, err = _http_get_json(f"https://huggingface.co/api/collections/{namespace}/{slug_id}")
474
+ item = data if isinstance(data, dict) else {}
475
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
476
+
477
+
478
+ class HFPaperInfoTool(Tool):
479
+ name = "hf_paper_info"
480
+ description = (
481
+ "Fetch paper metadata by arXiv id (e.g., 1706.03762). Combine with hf_paper_repos to find related repos."
482
+ )
483
+ inputs = {"arxiv_id": {"type": "string", "description": "arXiv identifier, e.g. 1706.03762"}}
484
+ output_type = "string"
485
+
486
+ def forward(self, arxiv_id: str) -> str:
487
+ data, status, err = _http_get_json(f"https://huggingface.co/api/papers/{arxiv_id}")
488
+ item = data if isinstance(data, dict) else {}
489
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
490
+
491
+
492
+ class HFPaperReposTool(Tool):
493
+ name = "hf_paper_repos"
494
+ description = (
495
+ "List repos (models/datasets/spaces) referencing an arXiv id. Use alongside hf_paper_info to map research → repos."
496
+ )
497
+ inputs = {"arxiv_id": {"type": "string", "description": "arXiv identifier, e.g. 1706.03762"}}
498
+ output_type = "string"
499
+
500
+ def forward(self, arxiv_id: str) -> str:
501
+ data, status, err = _http_get_json(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
502
+ results = data if isinstance(data, list) else []
503
+ return json.dumps({"results": results, "status": status, "error": err}, ensure_ascii=False)
504
+
505
+
506
+ class HFDailyPapersTool(Tool):
507
+ name = "hf_daily_papers"
508
+ description = (
509
+ "Get the daily curated papers list from Hugging Face. Useful for current research trends."
510
+ )
511
+ inputs = {}
512
+ output_type = "string"
513
+
514
+ def forward(self) -> str: # type: ignore[override]
515
+ data, status, err = _http_get_json("https://huggingface.co/api/daily_papers")
516
+ results = data if isinstance(data, list) else []
517
+ return json.dumps({"results": results, "status": status, "error": err}, ensure_ascii=False)
518
+
519
+
520
+ class HFRepoInfoTool(Tool):
521
+ name = "hf_repo_info"
522
+ description = (
523
+ "Generic repo info for model|dataset|space by id. Use if you already know the type and want raw item metadata."
524
+ )
525
+ inputs = {
526
+ "repo_type": {"type": "string", "description": "model|dataset|space"},
527
+ "repo_id": {"type": "string", "description": "Owner/name or id"},
528
+ }
529
+ output_type = "string"
530
+
531
+ def forward(self, repo_type: str, repo_id: str) -> str:
532
+ repo_type = (repo_type or "").strip().lower()
533
+ if repo_type not in {"model", "dataset", "space"}:
534
+ return json.dumps({"error": "invalid_repo_type", "status": 400})
535
+ base = {"model": "models", "dataset": "datasets", "space": "spaces"}[repo_type]
536
+ data, status, err = _http_get_json(f"https://huggingface.co/api/{base}/{repo_id}", {"full": "true"})
537
+ item = data if isinstance(data, dict) else {}
538
+ return json.dumps({"item": item, "status": status, "error": err}, ensure_ascii=False)
539
+
540
+
541
+ class HFSiteSearchTool(Tool):
542
+ name = "hf_site_search"
543
+ description = (
544
+ "Search within huggingface.co for blogs, Learn pages, and posts (DuckDuckGo). Prefer this for tutorials and docs not covered by Hub APIs. "
545
+ "Defaults: limit=10 to reduce rate limiting. Returns JSON with `results`, `status`, and `error`."
546
+ )
547
+ inputs = {
548
+ "query": {"type": "string", "description": "Search query. 'site:huggingface.co' will be added if missing."},
549
+ "limit": {"type": "number", "description": "Max results (default 10)", "nullable": True},
550
+ }
551
+ output_type = "string"
552
+
553
+ def forward(self, query: str, limit: Optional[int] = None) -> str:
554
+ try:
555
+ from duckduckgo_search import DDGS
556
+ except Exception:
557
+ return json.dumps({"results": [], "status": 500, "error": "duckduckgo_search_not_installed"})
558
+
559
+ q = f"site:huggingface.co {query}" if "huggingface.co" not in query else query
560
+ lim = int(limit) if limit else 10
561
+ results: List[Dict] = []
562
+ try:
563
+ with DDGS() as ddgs:
564
+ for r in ddgs.text(q, safesearch="moderate", timelimit=None, max_results=lim):
565
+ if not isinstance(r, dict):
566
+ continue
567
+ results.append({
568
+ "type": "site",
569
+ "title": r.get("title"),
570
+ "url": r.get("href"),
571
+ "snippet": r.get("body"),
572
+ "date": r.get("date"),
573
+ })
574
+ except Exception as e:
575
+ return json.dumps({"results": [], "status": 500, "error": str(e)})
576
+ return json.dumps({"results": results, "status": 200, "error": ""}, ensure_ascii=False)
577
+
578
+
579
+ class HFReportGenerateTool(Tool):
580
+ name = "hf_report_generate"
581
+ description = (
582
+ "Generate a full HTML report from aggregated JSON (string). The app prefers its own dashboard, but this can render custom summaries."
583
+ )
584
+ inputs = {
585
+ "data_json": {"type": "string", "description": "Aggregated search results JSON"},
586
+ "title": {"type": "string", "description": "Report title", "nullable": True},
587
+ }
588
+ output_type = "string"
589
+
590
+ def forward(self, data_json: str, title: Optional[str] = None) -> str:
591
+ try:
592
+ data = json.loads(data_json) if data_json else {}
593
+ except Exception as e:
594
+ data = {"parse_error": str(e)}
595
+ title = title or "Hugging Face Research Report"
596
+
597
+ def card_html(item: Dict) -> str:
598
+ badge = ""
599
+ vis = item.get("visibility")
600
+ access = item.get("access")
601
+ if vis in {"private", "gated"} or access == "no_access":
602
+ badge = f"<span class='badge badge-warn'>{vis or 'restricted'}</span>"
603
+ meta = []
604
+ if item.get("task"):
605
+ meta.append(f"<span class=meta>Task: {item['task']}</span>")
606
+ if item.get("tags"):
607
+ meta.append(f"<span class=meta>Tags: {', '.join(item['tags'][:5])}</span>")
608
+ if item.get("downloads") is not None:
609
+ meta.append(f"<span class=stat>⬇️ {item['downloads']}</span>")
610
+ if item.get("likes") is not None:
611
+ meta.append(f"<span class=stat>❤️ {item['likes']}</span>")
612
+ if item.get("updatedAt"):
613
+ meta.append(f"<span class=meta>Updated: {item['updatedAt']}</span>")
614
+ desc = (item.get("description") or "").strip()
615
+ if len(desc) > 220:
616
+ desc = desc[:217] + "..."
617
+ return (
618
+ "<div class=card>"
619
+ f"<div class=card-title><a href='{item.get('url')}' target=_blank rel=noopener>{item.get('id')}</a> {badge}</div>"
620
+ f"<div class=card-subtitle>{item.get('type','')} • {item.get('owner','')}</div>"
621
+ f"<div class=card-desc>{desc}</div>"
622
+ f"<div class=card-meta>{' | '.join(meta)}</div>"
623
+ "</div>"
624
+ )
625
+
626
+ def section(title_text: str, items: List[Dict]) -> str:
627
+ if not items:
628
+ return ""
629
+ cards = "\n".join(card_html(it) for it in items)
630
+ return f"<section><h2>{title_text}</h2><div class=cards>{cards}</div></section>"
631
+
632
+ # Expect a dict keyed by category (models, datasets, spaces, papers, ...); missing keys simply yield empty sections
633
+ models = data.get("models") or data.get("Models") or []
634
+ datasets = data.get("datasets") or data.get("Datasets") or []
635
+ spaces = data.get("spaces") or data.get("Spaces") or []
636
+ papers = data.get("papers") or data.get("Papers") or []
637
+ daily_papers = data.get("daily_papers") or data.get("DailyPapers") or []
638
+ users = data.get("users") or data.get("Users") or []
639
+ collections = data.get("collections") or data.get("Collections") or []
640
+ site = data.get("site") or data.get("Site") or []
641
+
642
+ html = f"""<!DOCTYPE html>
643
+ <html lang=\"en\">
644
+ <head>
645
+ <meta charset=\"utf-8\" />
646
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
647
+ <title>{title}</title>
648
+ <style>
649
+ :root {{ --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; --warn:#eab308; }}
650
+ body {{ background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }}
651
+ h1 {{ font-size: 24px; margin: 0 0 12px; }}
652
+ h2 {{ font-size: 18px; margin: 24px 0 8px; color: var(--accent); }}
653
+ .container {{ max-width: 1120px; margin: 0 auto; }}
654
+ .subtitle {{ color: var(--muted); margin-bottom: 18px; }}
655
+ .cards {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(280px,1fr)); gap: 12px; }}
656
+ .card {{ background: var(--card); border: 1px solid rgba(255,255,255,0.06); border-radius: 10px; padding: 12px; }}
657
+ .card-title {{ font-weight: 600; margin-bottom: 4px; overflow-wrap:anywhere; }}
658
+ .card-subtitle {{ color: var(--muted); font-size: 12px; margin-bottom: 8px; }}
659
+ .card-desc {{ font-size: 13px; line-height: 1.45; min-height: 28px; margin-bottom: 8px; color: #d2d7df; }}
660
+ .card-meta {{ font-size: 12px; color: var(--muted); display:flex; flex-wrap:wrap; gap:8px; }}
661
+ .badge {{ background: rgba(234, 179, 8, 0.15); color: #facc15; border:1px solid rgba(250,204,21,0.35); border-radius: 999px; padding: 2px 8px; font-size: 11px; margin-left: 6px; }}
662
+ .badge-warn {{ background: rgba(234, 179, 8, 0.15); }}
663
+ a {{ color: #93c5fd; text-decoration: none; }}
664
+ a:hover {{ text-decoration: underline; }}
665
+ section {{ margin-bottom: 18px; }}
666
+ </style>
667
+ <script>
668
+ function printToPDF() {{ window.print(); }}
669
+ </script>
670
+ <link rel=\"stylesheet\" href=\"https://cdnjs.cloudflare.com/ajax/libs/modern-normalize/2.0.0/modern-normalize.min.css\" />
671
+ <meta name=\"robots\" content=\"noindex\" />
672
+ <meta name=\"referrer\" content=\"no-referrer\" />
673
+ <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'self' 'unsafe-inline' data: https://cdnjs.cloudflare.com; img-src * data:; style-src 'self' 'unsafe-inline' https://cdnjs.cloudflare.com;\" />
674
+ </head>
675
+ <body>
676
+ <div class=\"container\">
677
+ <div style=\"display:flex; align-items:center; justify-content:space-between; gap:12px;\">
678
+ <div>
679
+ <h1>{title}</h1>
680
+ <div class=\"subtitle\">Generated by Hugging Search</div>
681
+ </div>
682
+ <button onclick=\"printToPDF()\" style=\"background:#1f2937;color:#e5e7eb;border:1px solid rgba(255,255,255,0.08);border-radius:8px;padding:8px 10px;cursor:pointer;\">Print to PDF</button>
683
+ </div>
684
+ {section("Models", models)}
685
+ {section("Datasets", datasets)}
686
+ {section("Spaces", spaces)}
687
+ {section("Papers", papers)}
688
+ {section("Daily Papers", daily_papers)}
689
+ {section("Users", users)}
690
+ {section("Collections", collections)}
691
+ {section("Site results", site)}
692
+ </div>
693
+ </body>
694
+ </html>
695
+ """
696
+ return html
697
+
698
+
699
+ class HFDashboardReportTool(Tool):
700
+ name = "hf_generate_dashboard_report"
701
+ description = (
702
+ "One-click dashboard report from a query. Fetches public models/datasets/spaces/daily_papers and returns a full HTML dashboard."
703
+ )
704
+ inputs = {
705
+ "query": {"type": "string", "description": "User intent / keywords to search across Hub"},
706
+ "limit": {"type": "number", "description": "Max results per category (default 20)", "nullable": True},
707
+ }
708
+ output_type = "string"
709
+
710
+ def forward(self, query: str, limit: Optional[int] = None) -> str:
711
+ lim = int(limit) if limit else 20
712
+ params_common = {"search": query, "sort": "downloads", "direction": "descending", "limit": lim}
713
+ # Fetch categories
714
+ m_data, m_status, _ = _http_get_json("https://huggingface.co/api/models", params_common)
715
+ d_data, d_status, _ = _http_get_json("https://huggingface.co/api/datasets", params_common)
716
+ s_data, s_status, _ = _http_get_json("https://huggingface.co/api/spaces", {"search": query, "sort": "likes", "direction": "descending", "limit": lim})
717
+ dp_data, dp_status, _ = _http_get_json("https://huggingface.co/api/daily_papers")
718
+
719
+ models: List[Dict] = []
720
+ if isinstance(m_data, list):
721
+ for it in m_data[:lim]:
722
+ model_id = _safe_get(it, "id") or _safe_get(it, "modelId") or ""
723
+ if not model_id:
724
+ continue
725
+ owner_name = model_id.split("/")[0] if "/" in model_id else ""
726
+ visibility, access = _visibility_from_item(it)
727
+ norm = _norm_common(model_id, "model", owner_name, "")
728
+ norm.update({
729
+ "tags": _safe_get(it, "tags", []),
730
+ "task": _safe_get(it, "pipeline_tag"),
731
+ "likes": _safe_get(it, "likes", 0),
732
+ "downloads": _safe_get(it, "downloads", 0),
733
+ "updatedAt": _safe_get(it, "lastModified"),
734
+ "visibility": visibility,
735
+ "access": access,
736
+ })
737
+ models.append(norm)
738
+
739
+ datasets: List[Dict] = []
740
+ if isinstance(d_data, list):
741
+ for it in d_data[:lim]:
742
+ ds_id = _safe_get(it, "id") or _safe_get(it, "datasetId") or ""
743
+ if not ds_id:
744
+ continue
745
+ owner_name = ds_id.split("/")[0] if "/" in ds_id else ""
746
+ visibility, access = _visibility_from_item(it)
747
+ norm = _norm_common(ds_id, "dataset", owner_name, "")
748
+ norm.update({
749
+ "tags": _safe_get(it, "tags", []),
750
+ "likes": _safe_get(it, "likes", 0),
751
+ "downloads": _safe_get(it, "downloads", 0),
752
+ "updatedAt": _safe_get(it, "lastModified"),
753
+ "visibility": visibility,
754
+ "access": access,
755
+ })
756
+ datasets.append(norm)
757
+
758
+ spaces: List[Dict] = []
759
+ if isinstance(s_data, list):
760
+ for it in s_data[:lim]:
761
+ sp_id = _safe_get(it, "id") or _safe_get(it, "spaceId") or ""
762
+ if not sp_id:
763
+ continue
764
+ owner_name = sp_id.split("/")[0] if "/" in sp_id else ""
765
+ visibility, access = _visibility_from_item(it)
766
+ norm = _norm_common(sp_id, "space", owner_name, "")
767
+ norm.update({
768
+ "tags": _safe_get(it, "tags", []),
769
+ "likes": _safe_get(it, "likes", 0),
770
+ "downloads": _safe_get(it, "downloads", 0),
771
+ "updatedAt": _safe_get(it, "lastModified"),
772
+ "visibility": visibility,
773
+ "access": access,
774
+ })
775
+ spaces.append(norm)
776
+
777
+ papers = dp_data if isinstance(dp_data, list) else []
778
+
779
+ # Build dashboard HTML
780
+ def card_html(item: Dict) -> str:
781
+ badge = ""
782
+ if item.get("visibility") in {"private", "gated"} or item.get("access") == "no_access":
783
+ badge = f"<span class='badge badge-warn'>{item.get('visibility','restricted')}</span>"
784
+ meta = []
785
+ if item.get("task"):
786
+ meta.append(f"<span class=meta>Task: {item['task']}</span>")
787
+ if item.get("tags"):
788
+ meta.append(f"<span class=meta>Tags: {', '.join(item['tags'][:5])}</span>")
789
+ if item.get("downloads") is not None:
790
+ meta.append(f"<span class=stat>⬇️ {item['downloads']}</span>")
791
+ if item.get("likes") is not None:
792
+ meta.append(f"<span class=stat>❤️ {item['likes']}</span>")
793
+ if item.get("updatedAt"):
794
+ meta.append(f"<span class=meta>Updated: {item['updatedAt']}</span>")
795
+ desc = (item.get("description") or "").strip()
796
+ if len(desc) > 200:
797
+ desc = desc[:197] + "..."
798
+ return (
799
+ "<div class=card>"
800
+ f"<div class=card-title><a href='{item.get('url')}' target=_blank rel=noopener>{item.get('id')}</a> {badge}</div>"
801
+ f"<div class=card-subtitle>{item.get('type','')} • {item.get('owner','')}</div>"
802
+ f"<div class=card-desc>{desc}</div>"
803
+ f"<div class=card-meta>{' | '.join(meta)}</div>"
804
+ "</div>"
805
+ )
806
+
807
+ def section(title_text: str, items: List[Dict]) -> str:
808
+ if not items:
809
+ return ""
810
+ cards = "\n".join(card_html(it) for it in items)
811
+ return f"<section><h2>{title_text}</h2><div class=cards>{cards}</div></section>"
812
+
813
+ html = f"""<!DOCTYPE html>
814
+ <html lang=\"en\">
815
+ <head>
816
+ <meta charset=\"utf-8\" />
817
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
818
+ <title>Hugging Search — Dashboard</title>
819
+ <style>
820
+ :root {{ --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; --warn:#eab308; }}
821
+ body {{ background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }}
822
+ .container {{ max-width: 1200px; margin: 0 auto; }}
823
+ .header {{ display:flex; justify-content:space-between; align-items:center; gap:12px; margin-bottom: 16px; }}
824
+ .title {{ font-size: 22px; margin: 0; }}
825
+ .subtitle {{ color: var(--muted); }}
826
+ .stats {{ display:flex; gap:10px; flex-wrap:wrap; margin: 8px 0 18px; }}
827
+ .stat-chip {{ background: var(--card); border: 1px solid rgba(255,255,255,0.08); border-radius: 999px; padding: 6px 10px; font-size: 12px; color: var(--muted); }}
828
+ .cards {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(280px,1fr)); gap: 12px; }}
829
+ .card {{ background: var(--card); border: 1px solid rgba(255,255,255,0.06); border-radius: 10px; padding: 12px; }}
830
+ .card-title {{ font-weight: 600; margin-bottom: 4px; overflow-wrap:anywhere; }}
831
+ .card-subtitle {{ color: var(--muted); font-size: 12px; margin-bottom: 8px; }}
832
+ .card-desc {{ font-size: 13px; line-height: 1.45; min-height: 28px; margin-bottom: 8px; color: #d2d7df; }}
833
+ .card-meta {{ font-size: 12px; color: var(--muted); display:flex; flex-wrap:wrap; gap:8px; }}
834
+ .badge {{ background: rgba(234, 179, 8, 0.15); color: #facc15; border:1px solid rgba(250,204,21,0.35); border-radius: 999px; padding: 2px 8px; font-size: 11px; margin-left: 6px; }}
835
+ h2 {{ font-size: 16px; margin: 18px 0 8px; color: var(--accent); }}
836
+ .actions {{ display:flex; gap:8px; align-items:center; }}
837
+ button {{ background:#1f2937;color:#e5e7eb;border:1px solid rgba(255,255,255,0.08);border-radius:8px;padding:8px 10px;cursor:pointer; }}
838
+ </style>
839
+ <script>
840
+ function printToPDF() {{ window.print(); }}
841
+ </script>
842
+ </head>
843
+ <body>
844
+ <div class=\"container\">
845
+ <div class=\"header\">
846
+ <div>
847
+ <div class=\"title\">Hugging Search — Dashboard</div>
848
+ <div class=\"subtitle\">Query: {query}</div>
849
+ </div>
850
+ <div class=\"actions\"><button onclick=\"printToPDF()\">Print to PDF</button></div>
851
+ </div>
852
+ <div class=\"stats\">
853
+ <div class=\"stat-chip\">Models: {len(models)}</div>
854
+ <div class=\"stat-chip\">Datasets: {len(datasets)}</div>
855
+ <div class=\"stat-chip\">Spaces: {len(spaces)}</div>
856
+ <div class=\"stat-chip\">Daily papers: {len(papers) if isinstance(papers,list) else 0}</div>
857
+ </div>
858
+ {section("Models", models)}
859
+ {section("Datasets", datasets)}
860
+ {section("Spaces", spaces)}
861
+ </div>
862
+ </body>
863
+ </html>
864
+ """
865
+ return html
866
+
867
+
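Every tool in scripts/hf_tools.py returns a JSON string with the same results/status/error envelope, so a caller (or a smolagents agent) can invoke forward() directly and decode the payload. A minimal sketch for review purposes, assuming the module is importable as scripts.hf_tools and that anonymous network access to huggingface.co is available:

import json

from scripts.hf_tools import HFModelsSearchTool

tool = HFModelsSearchTool()
# forward() returns a JSON string: {"results": [...], "status": <HTTP status>, "error": "", "params": {...}}
payload = json.loads(tool.forward(query="sentence similarity", task="sentence-similarity", limit=5))

if payload["status"] == 200 and not payload["error"]:
    for item in payload["results"]:
        # Each normalized item carries id/owner/url plus visibility/access flags;
        # gated or private repos are flagged as "no_access" instead of raising.
        print(item["id"], item.get("downloads", 0), item["access"], item["url"])
else:
    print("Hub lookup failed:", payload["status"], payload["error"])

The dataset, Space, paper, and collection tools follow the same pattern; only the type-specific fields (e.g. task for models) differ.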
scripts/mdconvert.py ADDED
@@ -0,0 +1,982 @@
1
+ # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
2
+ # Thanks to Microsoft researchers for open-sourcing this!
3
+ # type: ignore
4
+ import base64
5
+ import copy
6
+ import html
7
+ import json
8
+ import mimetypes
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import traceback
16
+ from typing import Any, Dict, List, Optional, Union
17
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
18
+
19
+ import mammoth
20
+ import markdownify
21
+ import pandas as pd
22
+ import pdfminer
23
+ import pdfminer.high_level
24
+ import pptx
25
+
26
+ # File-format detection
27
+ import puremagic
28
+ import pydub
29
+ import requests
30
+ import speech_recognition as sr
31
+ from bs4 import BeautifulSoup
32
+ from youtube_transcript_api import YouTubeTranscriptApi
33
+ from youtube_transcript_api.formatters import SRTFormatter
34
+
35
+
36
+ class _CustomMarkdownify(markdownify.MarkdownConverter):
37
+ """
38
+ A custom version of markdownify's MarkdownConverter. Changes include:
39
+
40
+ - Altering the default heading style to use '#', '##', etc.
41
+ - Removing javascript hyperlinks.
42
+ - Truncating images with large data:uri sources.
43
+ - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
44
+ """
45
+
46
+ def __init__(self, **options: Any):
47
+ options["heading_style"] = options.get("heading_style", markdownify.ATX)
48
+ # Explicitly cast options to the expected type if necessary
49
+ super().__init__(**options)
50
+
51
+ def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
52
+ """Same as usual, but be sure to start with a new line"""
53
+ if not convert_as_inline:
54
+ if not re.search(r"^\n", text):
55
+ return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
56
+
57
+ return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
58
+
59
+ def convert_a(self, el: Any, text: str, convert_as_inline: bool):
60
+ """Same as usual converter, but removes Javascript links and escapes URIs."""
61
+ prefix, suffix, text = markdownify.chomp(text) # type: ignore
62
+ if not text:
63
+ return ""
64
+ href = el.get("href")
65
+ title = el.get("title")
66
+
67
+ # Escape URIs and skip non-http or file schemes
68
+ if href:
69
+ try:
70
+ parsed_url = urlparse(href) # type: ignore
71
+ if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
72
+ return "%s%s%s" % (prefix, text, suffix)
73
+ href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
74
+ except ValueError: # It's not clear if this ever gets thrown
75
+ return "%s%s%s" % (prefix, text, suffix)
76
+
77
+ # For the replacement see #29: text nodes underscores are escaped
78
+ if (
79
+ self.options["autolinks"]
80
+ and text.replace(r"\_", "_") == href
81
+ and not title
82
+ and not self.options["default_title"]
83
+ ):
84
+ # Shortcut syntax
85
+ return "<%s>" % href
86
+ if self.options["default_title"] and not title:
87
+ title = href
88
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
89
+ return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
90
+
91
+ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
92
+ """Same as usual converter, but removes data URIs"""
93
+
94
+ alt = el.attrs.get("alt", None) or ""
95
+ src = el.attrs.get("src", None) or ""
96
+ title = el.attrs.get("title", None) or ""
97
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
98
+ if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
99
+ return alt
100
+
101
+ # Remove dataURIs
102
+ if src.startswith("data:"):
103
+ src = src.split(",")[0] + "..."
104
+
105
+ return "![%s](%s%s)" % (alt, src, title_part)
106
+
107
+ def convert_soup(self, soup: Any) -> str:
108
+ return super().convert_soup(soup) # type: ignore
109
+
110
+
111
+ class DocumentConverterResult:
112
+ """The result of converting a document to text."""
113
+
114
+ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
115
+ self.title: Union[str, None] = title
116
+ self.text_content: str = text_content
117
+
118
+
119
+ class DocumentConverter:
120
+ """Abstract superclass of all DocumentConverters."""
121
+
122
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
123
+ raise NotImplementedError()
124
+
125
+
126
+ class PlainTextConverter(DocumentConverter):
127
+ """Anything with content type text/plain"""
128
+
129
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
130
+ # Guess the content type from any file extension that might be around
131
+ content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
132
+
133
+ # Only accept text files
134
+ if content_type is None:
135
+ return None
136
+ # elif "text/" not in content_type.lower():
137
+ # return None
138
+
139
+ text_content = ""
140
+ with open(local_path, "rt", encoding="utf-8") as fh:
141
+ text_content = fh.read()
142
+ return DocumentConverterResult(
143
+ title=None,
144
+ text_content=text_content,
145
+ )
146
+
147
+
148
+ class JavaScriptConverter(DocumentConverter):
149
+ """Handle JavaScript files (.js, .mjs) as plain text"""
150
+
151
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
152
+ # Check if it's a JavaScript file
153
+ extension = kwargs.get("file_extension", "")
154
+ if extension.lower() not in [".js", ".mjs"]:
155
+ return None
156
+
157
+ try:
158
+ # Read the file as text
159
+ with open(local_path, "rt", encoding="utf-8") as fh:
160
+ text_content = fh.read()
161
+
162
+ return DocumentConverterResult(
163
+ title=f"JavaScript file: {os.path.basename(local_path)}",
164
+ text_content=text_content,
165
+ )
166
+ except Exception as e:
167
+ # If UTF-8 fails, try with different encoding
168
+ try:
169
+ with open(local_path, "rt", encoding="latin-1") as fh:
170
+ text_content = fh.read()
171
+
172
+ return DocumentConverterResult(
173
+ title=f"JavaScript file: {os.path.basename(local_path)}",
174
+ text_content=text_content,
175
+ )
176
+ except Exception:
177
+ return None
178
+
179
+
180
+ class HtmlConverter(DocumentConverter):
181
+ """Anything with content type text/html"""
182
+
183
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
184
+ # Bail if not html
185
+ extension = kwargs.get("file_extension", "")
186
+ if extension.lower() not in [".html", ".htm"]:
187
+ return None
188
+
189
+ result = None
190
+ with open(local_path, "rt", encoding="utf-8") as fh:
191
+ result = self._convert(fh.read())
192
+
193
+ return result
194
+
195
+ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
196
+ """Helper function that converts an HTML string."""
197
+
198
+ # Parse the string
199
+ soup = BeautifulSoup(html_content, "html.parser")
200
+
201
+ # Remove javascript and style blocks
202
+ for script in soup(["script", "style"]):
203
+ script.extract()
204
+
205
+ # Print only the main content
206
+ body_elm = soup.find("body")
207
+ webpage_text = ""
208
+ if body_elm:
209
+ webpage_text = _CustomMarkdownify().convert_soup(body_elm)
210
+ else:
211
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
212
+
213
+ assert isinstance(webpage_text, str)
214
+
215
+ return DocumentConverterResult(
216
+ title=None if soup.title is None else soup.title.string, text_content=webpage_text
217
+ )
218
+
219
+
220
+ class WikipediaConverter(DocumentConverter):
221
+ """Handle Wikipedia pages separately, focusing only on the main document content."""
222
+
223
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
224
+ # Bail if not Wikipedia
225
+ extension = kwargs.get("file_extension", "")
226
+ if extension.lower() not in [".html", ".htm"]:
227
+ return None
228
+ url = kwargs.get("url", "")
229
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
230
+ return None
231
+
232
+ # Parse the file
233
+ soup = None
234
+ with open(local_path, "rt", encoding="utf-8") as fh:
235
+ soup = BeautifulSoup(fh.read(), "html.parser")
236
+
237
+ # Remove javascript and style blocks
238
+ for script in soup(["script", "style"]):
239
+ script.extract()
240
+
241
+ # Print only the main content
242
+ body_elm = soup.find("div", {"id": "mw-content-text"})
243
+ title_elm = soup.find("span", {"class": "mw-page-title-main"})
244
+
245
+ webpage_text = ""
246
+ main_title = None if soup.title is None else soup.title.string
247
+
248
+ if body_elm:
249
+ # What's the title
250
+ if title_elm and len(title_elm) > 0:
251
+ main_title = title_elm.string # type: ignore
252
+ assert isinstance(main_title, str)
253
+
254
+ # Convert the page
255
+ webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
256
+ else:
257
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
258
+
259
+ return DocumentConverterResult(
260
+ title=main_title,
261
+ text_content=webpage_text,
262
+ )
263
+
264
+
265
+ class YouTubeConverter(DocumentConverter):
266
+ """Handle YouTube specially, focusing on the video title, description, and transcript."""
267
+
268
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
269
+ # Bail if not YouTube
270
+ extension = kwargs.get("file_extension", "")
271
+ if extension.lower() not in [".html", ".htm"]:
272
+ return None
273
+ url = kwargs.get("url", "")
274
+ if not url.startswith("https://www.youtube.com/watch?"):
275
+ return None
276
+
277
+ # Parse the file
278
+ soup = None
279
+ with open(local_path, "rt", encoding="utf-8") as fh:
280
+ soup = BeautifulSoup(fh.read(), "html.parser")
281
+
282
+ # Read the meta tags
283
+ assert soup.title is not None and soup.title.string is not None
284
+ metadata: Dict[str, str] = {"title": soup.title.string}
285
+ for meta in soup(["meta"]):
286
+ for a in meta.attrs:
287
+ if a in ["itemprop", "property", "name"]:
288
+ metadata[meta[a]] = meta.get("content", "")
289
+ break
290
+
291
+ # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
292
+ try:
293
+ for script in soup(["script"]):
294
+ content = script.text
295
+ if "ytInitialData" in content:
296
+ lines = re.split(r"\r?\n", content)
297
+ obj_start = lines[0].find("{")
298
+ obj_end = lines[0].rfind("}")
299
+ if obj_start >= 0 and obj_end >= 0:
300
+ data = json.loads(lines[0][obj_start : obj_end + 1])
301
+ attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
302
+ if attrdesc:
303
+ metadata["description"] = str(attrdesc["content"])
304
+ break
305
+ except Exception:
306
+ pass
307
+
308
+ # Start preparing the page
309
+ webpage_text = "# YouTube\n"
310
+
311
+ title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
312
+ assert isinstance(title, str)
313
+
314
+ if title:
315
+ webpage_text += f"\n## {title}\n"
316
+
317
+ stats = ""
318
+ views = self._get(metadata, ["interactionCount"]) # type: ignore
319
+ if views:
320
+ stats += f"- **Views:** {views}\n"
321
+
322
+ keywords = self._get(metadata, ["keywords"]) # type: ignore
323
+ if keywords:
324
+ stats += f"- **Keywords:** {keywords}\n"
325
+
326
+ runtime = self._get(metadata, ["duration"]) # type: ignore
327
+ if runtime:
328
+ stats += f"- **Runtime:** {runtime}\n"
329
+
330
+ if len(stats) > 0:
331
+ webpage_text += f"\n### Video Metadata\n{stats}\n"
332
+
333
+ description = self._get(metadata, ["description", "og:description"]) # type: ignore
334
+ if description:
335
+ webpage_text += f"\n### Description\n{description}\n"
336
+
337
+ transcript_text = ""
338
+ parsed_url = urlparse(url) # type: ignore
339
+ params = parse_qs(parsed_url.query) # type: ignore
340
+ if "v" in params:
341
+ assert isinstance(params["v"][0], str)
342
+ video_id = str(params["v"][0])
343
+ try:
344
+ # Must be a single transcript.
345
+ transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
346
+ # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
347
+ # Alternative formatting:
348
+ transcript_text = SRTFormatter().format_transcript(transcript)
349
+ except Exception:
350
+ pass
351
+ if transcript_text:
352
+ webpage_text += f"\n### Transcript\n{transcript_text}\n"
353
+
354
+ title = title if title else soup.title.string
355
+ assert isinstance(title, str)
356
+
357
+ return DocumentConverterResult(
358
+ title=title,
359
+ text_content=webpage_text,
360
+ )
361
+
362
+ def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
363
+ for k in keys:
364
+ if k in metadata:
365
+ return metadata[k]
366
+ return default
367
+
368
+ def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
369
+ if isinstance(json, list):
370
+ for elm in json:
371
+ ret = self._findKey(elm, key)
372
+ if ret is not None:
373
+ return ret
374
+ elif isinstance(json, dict):
375
+ for k in json:
376
+ if k == key:
377
+ return json[k]
378
+ else:
379
+ ret = self._findKey(json[k], key)
380
+ if ret is not None:
381
+ return ret
382
+ return None
383
+
384
+
385
+ class PdfConverter(DocumentConverter):
386
+ """
387
+ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
388
+ """
389
+
390
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
391
+ # Bail if not a PDF
392
+ extension = kwargs.get("file_extension", "")
393
+ if extension.lower() != ".pdf":
394
+ return None
395
+
396
+ return DocumentConverterResult(
397
+ title=None,
398
+ text_content=pdfminer.high_level.extract_text(local_path),
399
+ )
400
+
401
+
402
+ class DocxConverter(HtmlConverter):
403
+ """
404
+ Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
405
+ """
406
+
407
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
408
+ # Bail if not a DOCX
409
+ extension = kwargs.get("file_extension", "")
410
+ if extension.lower() != ".docx":
411
+ return None
412
+
413
+ result = None
414
+ with open(local_path, "rb") as docx_file:
415
+ result = mammoth.convert_to_html(docx_file)
416
+ html_content = result.value
417
+ result = self._convert(html_content)
418
+
419
+ return result
420
+
421
+
422
+ class XlsxConverter(HtmlConverter):
423
+ """
424
+ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
425
+ """
426
+
427
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
428
+ # Bail if not an XLSX
429
+ extension = kwargs.get("file_extension", "")
430
+ if extension.lower() not in [".xlsx", ".xls"]:
431
+ return None
432
+
433
+ sheets = pd.read_excel(local_path, sheet_name=None)
434
+ md_content = ""
435
+ for s in sheets:
436
+ md_content += f"## {s}\n"
437
+ html_content = sheets[s].to_html(index=False)
438
+ md_content += self._convert(html_content).text_content.strip() + "\n\n"
439
+
440
+ return DocumentConverterResult(
441
+ title=None,
442
+ text_content=md_content.strip(),
443
+ )
444
+
445
+
446
+ class PptxConverter(HtmlConverter):
447
+ """
448
+ Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
449
+ """
450
+
451
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
452
+ # Bail if not a PPTX
453
+ extension = kwargs.get("file_extension", "")
454
+ if extension.lower() != ".pptx":
455
+ return None
456
+
457
+ md_content = ""
458
+
459
+ presentation = pptx.Presentation(local_path)
460
+ slide_num = 0
461
+ for slide in presentation.slides:
462
+ slide_num += 1
463
+
464
+ md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
465
+
466
+ title = slide.shapes.title
467
+ for shape in slide.shapes:
468
+ # Pictures
469
+ if self._is_picture(shape):
470
+ # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
471
+ alt_text = ""
472
+ try:
473
+ alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
474
+ except Exception:
475
+ pass
476
+
477
+ # A placeholder name
478
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
479
+ md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
480
+
481
+ # Tables
482
+ if self._is_table(shape):
483
+ html_table = "<html><body><table>"
484
+ first_row = True
485
+ for row in shape.table.rows:
486
+ html_table += "<tr>"
487
+ for cell in row.cells:
488
+ if first_row:
489
+ html_table += "<th>" + html.escape(cell.text) + "</th>"
490
+ else:
491
+ html_table += "<td>" + html.escape(cell.text) + "</td>"
492
+ html_table += "</tr>"
493
+ first_row = False
494
+ html_table += "</table></body></html>"
495
+ md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
496
+
497
+ # Text areas
498
+ elif shape.has_text_frame:
499
+ if shape == title:
500
+ md_content += "# " + shape.text.lstrip() + "\n"
501
+ else:
502
+ md_content += shape.text + "\n"
503
+
504
+ md_content = md_content.strip()
505
+
506
+ if slide.has_notes_slide:
507
+ md_content += "\n\n### Notes:\n"
508
+ notes_frame = slide.notes_slide.notes_text_frame
509
+ if notes_frame is not None:
510
+ md_content += notes_frame.text
511
+ md_content = md_content.strip()
512
+
513
+ return DocumentConverterResult(
514
+ title=None,
515
+ text_content=md_content.strip(),
516
+ )
517
+
518
+ def _is_picture(self, shape):
519
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
520
+ return True
521
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
522
+ if hasattr(shape, "image"):
523
+ return True
524
+ return False
525
+
526
+ def _is_table(self, shape):
527
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
528
+ return True
529
+ return False
530
+
531
+
532
+ class MediaConverter(DocumentConverter):
533
+ """
534
+ Abstract class for multi-modal media (e.g., images and audio)
535
+ """
536
+
537
+ def _get_metadata(self, local_path):
538
+ exiftool = shutil.which("exiftool")
539
+ if not exiftool:
540
+ return None
541
+ else:
542
+ try:
543
+ result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
544
+ return json.loads(result)[0]
545
+ except Exception:
546
+ return None
547
+
548
+
549
+ class WavConverter(MediaConverter):
550
+ """
551
+ Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
552
+ """
553
+
554
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
555
+ # Bail if not a WAV
556
+ extension = kwargs.get("file_extension", "")
557
+ if extension.lower() != ".wav":
558
+ return None
559
+
560
+ md_content = ""
561
+
562
+ # Add metadata
563
+ metadata = self._get_metadata(local_path)
564
+ if metadata:
565
+ for f in [
566
+ "Title",
567
+ "Artist",
568
+ "Author",
569
+ "Band",
570
+ "Album",
571
+ "Genre",
572
+ "Track",
573
+ "DateTimeOriginal",
574
+ "CreateDate",
575
+ "Duration",
576
+ ]:
577
+ if f in metadata:
578
+ md_content += f"{f}: {metadata[f]}\n"
579
+
580
+ # Transcribe
581
+ try:
582
+ transcript = self._transcribe_audio(local_path)
583
+ md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
584
+ except Exception:
585
+ md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
586
+
587
+ return DocumentConverterResult(
588
+ title=None,
589
+ text_content=md_content.strip(),
590
+ )
591
+
592
+ def _transcribe_audio(self, local_path) -> str:
593
+ recognizer = sr.Recognizer()
594
+ with sr.AudioFile(local_path) as source:
595
+ audio = recognizer.record(source)
596
+ return recognizer.recognize_google(audio).strip()
597
+
598
+
599
+ class Mp3Converter(WavConverter):
600
+ """
601
+ Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
602
+ """
603
+
604
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
605
+ # Bail if not an MP3
606
+ extension = kwargs.get("file_extension", "")
607
+ if extension.lower() != ".mp3":
608
+ return None
609
+
610
+ md_content = ""
611
+
612
+ # Add metadata
613
+ metadata = self._get_metadata(local_path)
614
+ if metadata:
615
+ for f in [
616
+ "Title",
617
+ "Artist",
618
+ "Author",
619
+ "Band",
620
+ "Album",
621
+ "Genre",
622
+ "Track",
623
+ "DateTimeOriginal",
624
+ "CreateDate",
625
+ "Duration",
626
+ ]:
627
+ if f in metadata:
628
+ md_content += f"{f}: {metadata[f]}\n"
629
+
630
+ # Transcribe
631
+ handle, temp_path = tempfile.mkstemp(suffix=".wav")
632
+ os.close(handle)
633
+ try:
634
+ sound = pydub.AudioSegment.from_mp3(local_path)
635
+ sound.export(temp_path, format="wav")
636
+
637
+ _args = dict()
638
+ _args.update(kwargs)
639
+ _args["file_extension"] = ".wav"
640
+
641
+ try:
642
+ transcript = super()._transcribe_audio(temp_path).strip()
643
+ md_content += "\n\n### Audio Transcript:\n" + (
644
+ "[No speech detected]" if transcript == "" else transcript
645
+ )
646
+ except Exception:
647
+ md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
648
+
649
+ finally:
650
+ os.unlink(temp_path)
651
+
652
+ # Return the result
653
+ return DocumentConverterResult(
654
+ title=None,
655
+ text_content=md_content.strip(),
656
+ )
657
+
658
+
659
+ class ImageConverter(MediaConverter):
660
+ """
661
+ Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
662
+ """
663
+
664
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
665
+ # Bail if not a supported image type
666
+ extension = kwargs.get("file_extension", "")
667
+ if extension.lower() not in [".jpg", ".jpeg", ".png"]:
668
+ return None
669
+
670
+ md_content = ""
671
+
672
+ # Add metadata
673
+ metadata = self._get_metadata(local_path)
674
+ if metadata:
675
+ for f in [
676
+ "ImageSize",
677
+ "Title",
678
+ "Caption",
679
+ "Description",
680
+ "Keywords",
681
+ "Artist",
682
+ "Author",
683
+ "DateTimeOriginal",
684
+ "CreateDate",
685
+ "GPSPosition",
686
+ ]:
687
+ if f in metadata:
688
+ md_content += f"{f}: {metadata[f]}\n"
689
+
690
+ # Try describing the image with GPTV
691
+ mlm_client = kwargs.get("mlm_client")
692
+ mlm_model = kwargs.get("mlm_model")
693
+ if mlm_client is not None and mlm_model is not None:
694
+ md_content += (
695
+ "\n# Description:\n"
696
+ + self._get_mlm_description(
697
+ local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
698
+ ).strip()
699
+ + "\n"
700
+ )
701
+
702
+ return DocumentConverterResult(
703
+ title=None,
704
+ text_content=md_content,
705
+ )
706
+
707
+ def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
708
+ if prompt is None or prompt.strip() == "":
709
+ prompt = "Write a detailed caption for this image."
710
+
711
+ sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
712
+
713
+ data_uri = ""
714
+ with open(local_path, "rb") as image_file:
715
+ content_type, encoding = mimetypes.guess_type("_dummy" + extension)
716
+ if content_type is None:
717
+ content_type = "image/jpeg"
718
+ image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
719
+ data_uri = f"data:{content_type};base64,{image_base64}"
720
+
721
+ messages = [
722
+ {
723
+ "role": "user",
724
+ "content": [
725
+ {"type": "text", "text": prompt},
726
+ {
727
+ "type": "image_url",
728
+ "image_url": {
729
+ "url": data_uri,
730
+ },
731
+ },
732
+ ],
733
+ }
734
+ ]
735
+
736
+ response = client.chat.completions.create(model=model, messages=messages)
737
+ return response.choices[0].message.content
738
+
739
+
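Note: the `mlm_client` passed to `ImageConverter.convert` above is expected to expose an OpenAI-style `chat.completions.create()` API. A minimal wiring sketch follows; the base URL, model id and image path are illustrative assumptions, not values defined in this repository:

    # Sketch only: any OpenAI-compatible client works as mlm_client.
    from openai import OpenAI
    from scripts.mdconvert import ImageConverter  # assumed import path within this repo

    client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")  # assumed local endpoint
    converter = ImageConverter()
    result = converter.convert(
        "photo.jpg",                       # placeholder image path
        file_extension=".jpg",
        mlm_client=client,
        mlm_model="qwen2.5-vl:7b",         # placeholder multimodal model id
        mlm_prompt="Describe this image in two sentences.",
    )
    print(result.text_content)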
740
+ class FileConversionException(BaseException):
741
+ pass
742
+
743
+
744
+ class UnsupportedFormatException(BaseException):
745
+ pass
746
+
747
+
748
+ class MarkdownConverter:
749
+ """(In preview) An extremely simple text-based document reader, suitable for LLM use.
750
+ This reader will convert common file-types or webpages to Markdown."""
751
+
752
+ def __init__(
753
+ self,
754
+ requests_session: Optional[requests.Session] = None,
755
+ mlm_client: Optional[Any] = None,
756
+ mlm_model: Optional[Any] = None,
757
+ ):
758
+ if requests_session is None:
759
+ self._requests_session = requests.Session()
760
+ else:
761
+ self._requests_session = requests_session
762
+
763
+ self._mlm_client = mlm_client
764
+ self._mlm_model = mlm_model
765
+
766
+ self._page_converters: List[DocumentConverter] = []
767
+
768
+ # Register converters for successful browsing operations
769
+ # Later registrations are tried first / take higher priority than earlier registrations
770
+ # To this end, the most specific converters should appear below the most generic converters
771
+ self.register_page_converter(PlainTextConverter())
772
+ self.register_page_converter(JavaScriptConverter())
773
+ self.register_page_converter(HtmlConverter())
774
+ self.register_page_converter(WikipediaConverter())
775
+ self.register_page_converter(YouTubeConverter())
776
+ self.register_page_converter(DocxConverter())
777
+ self.register_page_converter(XlsxConverter())
778
+ self.register_page_converter(PptxConverter())
779
+ self.register_page_converter(WavConverter())
780
+ self.register_page_converter(Mp3Converter())
781
+ self.register_page_converter(ImageConverter())
782
+ self.register_page_converter(PdfConverter())
783
+
784
+ def convert(
785
+ self, source: Union[str, requests.Response], **kwargs: Any
786
+ ) -> DocumentConverterResult: # TODO: deal with kwargs
787
+ """
788
+ Args:
789
+ - source: can be a string representing a path or url, or a requests.response object
790
+ - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
791
+ """
792
+
793
+ # Local path or url
794
+ if isinstance(source, str):
795
+ if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
796
+ return self.convert_url(source, **kwargs)
797
+ else:
798
+ return self.convert_local(source, **kwargs)
799
+ # Request response
800
+ elif isinstance(source, requests.Response):
801
+ return self.convert_response(source, **kwargs)
802
+
803
+ def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
804
+ # Prepare a list of extensions to try (in order of priority)
805
+ ext = kwargs.get("file_extension")
806
+ extensions = [ext] if ext is not None else []
807
+
808
+ # Get extension alternatives from the path and puremagic
809
+ base, ext = os.path.splitext(path)
810
+ self._append_ext(extensions, ext)
811
+ self._append_ext(extensions, self._guess_ext_magic(path))
812
+
813
+ # Convert
814
+ return self._convert(path, extensions, **kwargs)
815
+
816
+ # TODO what should stream's type be?
817
+ def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
818
+ # Prepare a list of extensions to try (in order of priority)
819
+ ext = kwargs.get("file_extension")
820
+ extensions = [ext] if ext is not None else []
821
+
822
+ # Save the file locally to a temporary file. It will be deleted before this method exits
823
+ handle, temp_path = tempfile.mkstemp()
824
+ fh = os.fdopen(handle, "wb")
825
+ result = None
826
+ try:
827
+ # Write to the temporary file
828
+ content = stream.read()
829
+ if isinstance(content, str):
830
+ fh.write(content.encode("utf-8"))
831
+ else:
832
+ fh.write(content)
833
+ fh.close()
834
+
835
+ # Use puremagic to check for more extension options
836
+ self._append_ext(extensions, self._guess_ext_magic(temp_path))
837
+
838
+ # Convert
839
+ result = self._convert(temp_path, extensions, **kwargs)
840
+ # Clean up
841
+ finally:
842
+ try:
843
+ fh.close()
844
+ except Exception:
845
+ pass
846
+ os.unlink(temp_path)
847
+
848
+ return result
849
+
850
+ def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type
851
+ # Send a HTTP request to the URL
852
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
853
+ response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
854
+ response.raise_for_status()
855
+ return self.convert_response(response, **kwargs)
856
+
857
+ def convert_response(
858
+ self, response: requests.Response, **kwargs: Any
859
+ ) -> DocumentConverterResult: # TODO fix kwargs type
860
+ # Prepare a list of extensions to try (in order of priority)
861
+ ext = kwargs.get("file_extension")
862
+ extensions = [ext] if ext is not None else []
863
+
864
+ # Guess from the mimetype
865
+ content_type = response.headers.get("content-type", "").split(";")[0]
866
+ self._append_ext(extensions, mimetypes.guess_extension(content_type))
867
+
868
+ # Read the content disposition if there is one
869
+ content_disposition = response.headers.get("content-disposition", "")
870
+ m = re.search(r"filename=([^;]+)", content_disposition)
871
+ if m:
872
+ base, ext = os.path.splitext(m.group(1).strip("\"'"))
873
+ self._append_ext(extensions, ext)
874
+
875
+ # Read the extension from the URL path
876
+ base, ext = os.path.splitext(urlparse(response.url).path)
877
+ self._append_ext(extensions, ext)
878
+
879
+ # Save the file locally to a temporary file. It will be deleted before this method exits
880
+ handle, temp_path = tempfile.mkstemp()
881
+ fh = os.fdopen(handle, "wb")
882
+ result = None
883
+ try:
884
+ # Download the file
885
+ for chunk in response.iter_content(chunk_size=512):
886
+ fh.write(chunk)
887
+ fh.close()
888
+
889
+ # Use puremagic to check for more extension options
890
+ self._append_ext(extensions, self._guess_ext_magic(temp_path))
891
+
892
+ # Convert
893
+ result = self._convert(temp_path, extensions, url=response.url)
894
+ except Exception as e:
895
+ print(f"Error in converting: {e}")
896
+
897
+ # Clean up
898
+ finally:
899
+ try:
900
+ fh.close()
901
+ except Exception:
902
+ pass
903
+ os.unlink(temp_path)
904
+
905
+ return result
906
+
907
+ def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
908
+ error_trace = ""
909
+ for ext in extensions + [None]: # Try last with no extension
910
+ for converter in self._page_converters:
911
+ _kwargs = copy.deepcopy(kwargs)
912
+
913
+ # Overwrite file_extension appropriately
914
+ if ext is None:
915
+ if "file_extension" in _kwargs:
916
+ del _kwargs["file_extension"]
917
+ else:
918
+ _kwargs.update({"file_extension": ext})
919
+
920
+ # Copy any additional global options
921
+ if "mlm_client" not in _kwargs and self._mlm_client is not None:
922
+ _kwargs["mlm_client"] = self._mlm_client
923
+
924
+ if "mlm_model" not in _kwargs and self._mlm_model is not None:
925
+ _kwargs["mlm_model"] = self._mlm_model
926
+
927
+ # If we hit an error, log it and keep trying
+ res = None
928
+ try:
929
+ res = converter.convert(local_path, **_kwargs)
930
+ except Exception:
931
+ error_trace = ("\n\n" + traceback.format_exc()).strip()
932
+
933
+ if res is not None:
934
+ # Normalize the content
935
+ res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
936
+ res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
937
+
938
+ # Todo
939
+ return res
940
+
941
+ # If we got this far without success, report any exceptions
942
+ if len(error_trace) > 0:
943
+ raise FileConversionException(
944
+ f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
945
+ )
946
+
947
+ # Nothing can handle it!
948
+ raise UnsupportedFormatException(
949
+ f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
950
+ )
951
+
952
+ def _append_ext(self, extensions, ext):
953
+ """Append a unique non-None, non-empty extension to a list of extensions."""
954
+ if ext is None:
955
+ return
956
+ ext = ext.strip()
957
+ if ext == "":
958
+ return
959
+ # if ext not in extensions:
960
+ if True:
961
+ extensions.append(ext)
962
+
963
+ def _guess_ext_magic(self, path):
964
+ """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
965
+ # Use puremagic to guess
966
+ try:
967
+ guesses = puremagic.magic_file(path)
968
+ if len(guesses) > 0:
969
+ ext = guesses[0].extension.strip()
970
+ if len(ext) > 0:
971
+ return ext
972
+ except FileNotFoundError:
973
+ pass
974
+ except IsADirectoryError:
975
+ pass
976
+ except PermissionError:
977
+ pass
978
+ return None
979
+
980
+ def register_page_converter(self, converter: DocumentConverter) -> None:
981
+ """Register a page text converter."""
982
+ self._page_converters.insert(0, converter)
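For orientation, here is a minimal usage sketch of `MarkdownConverter`; the import path, file name and URL are placeholders, and the optional `mlm_*` arguments are only needed for image description:

    from scripts.mdconvert import MarkdownConverter  # assumed import path within this repo

    converter = MarkdownConverter()

    # Local file: the path extension (plus puremagic's guess) selects a converter.
    result = converter.convert("uploads/report.pdf")  # placeholder path
    print(result.text_content[:200])

    # URL: content-type, content-disposition and the URL path are all used as extension hints.
    result = converter.convert("https://en.wikipedia.org/wiki/Markdown")
    print(result.title, result.text_content[:200])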
scripts/reformulator.py ADDED
@@ -0,0 +1,86 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import copy
4
+
5
+ from smolagents.models import MessageRole, Model
6
+
7
+
8
+ def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
9
+ messages = [
10
+ {
11
+ "role": MessageRole.SYSTEM,
12
+ "content": [
13
+ {
14
+ "type": "text",
15
+ "text": f"""Earlier you were asked the following:
16
+
17
+ {original_task}
18
+
19
+ Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
20
+ }
21
+ ],
22
+ }
23
+ ]
24
+
25
+ # The first message just repeats the question, so remove it
26
+ # if len(inner_messages) > 1:
27
+ # del inner_messages[0]
28
+
29
+ # copy them to this context
30
+ try:
31
+ for message in inner_messages:
32
+ if not message.get("content"):
33
+ continue
34
+ message = copy.deepcopy(message)
35
+ message["role"] = MessageRole.USER
36
+ messages.append(message)
37
+ except Exception:
38
+ messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]
39
+
40
+ # ask for the final answer
41
+ messages.append(
42
+ {
43
+ "role": MessageRole.USER,
44
+ "content": [
45
+ {
46
+ "type": "text",
47
+ "text": f"""
48
+ Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
49
+
50
+ {original_task}
51
+
52
+ To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
53
+ Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
54
+ ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
55
+ If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
56
+ If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
57
+ If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
58
+ If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
59
+ """,
60
+ }
61
+ ],
62
+ }
63
+ )
64
+
65
+ response = reformulation_model(messages).content
66
+
67
+ final_answer = response.split("FINAL ANSWER: ")[-1].strip()
68
+ print("> Reformulated answer: ", final_answer)
69
+
70
+ # if "unable to determine" in final_answer.lower():
71
+ # messages.append({"role": MessageRole.ASSISTANT, "content": response })
72
+ # messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
73
+ # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
74
+
75
+ # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
76
+ # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
77
+ # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
78
+ # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
79
+ # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
80
+ # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
81
+ # """.strip()}]})
82
+
83
+ # response = model(messages).content
84
+ # print("\n>>>Making an educated guess.\n", response)
85
+ # final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
86
+ return final_answer
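A hedged usage sketch for `prepare_response`; the model id and transcript are illustrative, and `InferenceClientModel` is an assumption about the installed smolagents version:

    from scripts.reformulator import prepare_response  # assumed import path within this repo
    from smolagents import InferenceClientModel        # older releases expose HfApiModel instead

    model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")  # placeholder model id
    transcript = [
        {"role": "assistant", "content": [{"type": "text", "text": "The dataset covers 12 languages."}]},
    ]
    answer = prepare_response(
        "How many languages does the dataset cover?", transcript, reformulation_model=model
    )
    print(answer)  # expected: "12"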
scripts/report_generator.py ADDED
@@ -0,0 +1,153 @@
1
+ from typing import List, Optional, Dict
2
+ from smolagents import Tool
3
+
4
+ class HFLinkReportTool(Tool):
5
+ """Generate a single-layout HTML report (cards + counters) from a final textual answer.
6
+ The tool extracts links from the provided text, categorizes them (HF models/datasets/spaces/papers, blogs, repos, videos, news),
7
+ and renders a consistent link report. Always returns a full HTML document (starts with <!DOCTYPE html>)."""
8
+
9
+ name = "hf_links_to_report"
10
+ description = (
11
+ "Create an HTML report from a final answer text. The tool parses links, groups them into categories "
12
+ "(Hugging Face models/datasets/spaces/papers and external resources like blogs/repos/videos/news), and renders cards. "
13
+ "Inputs: final_answer (string, required), query (string, optional), title (string, optional). Returns an HTML document."
14
+ )
15
+ inputs = {
16
+ "final_answer": {"type": "string", "description": "Final answer text containing inline links"},
17
+ "query": {"type": "string", "description": "Original user intent or topic", "nullable": True},
18
+ "title": {"type": "string", "description": "Dashboard title", "nullable": True},
19
+ }
20
+ output_type = "string"
21
+
22
+ def forward(self, final_answer: str, query: Optional[str] = None, title: Optional[str] = None) -> str:
23
+ try:
24
+ import re
25
+ import json as _json
26
+ doc_title = title or "Report"
27
+ query = (query or "").strip()
28
+
29
+ # Extract URLs
30
+ urls = re.findall(r"https?://[^\s)\]]+", final_answer or "")
31
+ # Categorize
32
+ cats = {
33
+ "models": [], "datasets": [], "spaces": [], "papers": [],
34
+ "blogs": [], "repos": [], "videos": [], "news": [], "other": []
35
+ }
36
+ for u in urls:
37
+ low = u.lower()
38
+ if "huggingface.co/" in low:
39
+ # Prefer explicit kinds first to avoid misclassifying /datasets/* as generic owner/repo
40
+ if "/datasets/" in low:
41
+ cats["datasets"].append(u)
42
+ elif "/spaces/" in low:
43
+ cats["spaces"].append(u)
44
+ elif "/papers/" in low:
45
+ cats["papers"].append(u)
46
+ elif "/models/" in low:
47
+ cats["models"].append(u)
48
+ else:
49
+ # Treat bare owner/repo as models only if it is NOT under known sections
50
+ # e.g., huggingface.co/owner/repo → model repo; huggingface.co/blog/... → blog
51
+ m = re.search(r"huggingface\.co/([^/]+)/([^/]+)$", low)
52
+ if m and m.group(1) not in {"datasets", "spaces", "papers", "blog", "learn", "docs", "organizations", "collections"}:
53
+ cats["models"].append(u)
54
+ else:
55
+ cats["blogs"].append(u)
56
+ elif "github.com" in low:
57
+ cats["repos"].append(u)
58
+ elif "youtube.com" in low or "youtu.be" in low:
59
+ cats["videos"].append(u)
60
+ elif any(d in low for d in ["arxiv.org", "medium.com", "towardsdatascience.com", "huggingface.co/blog", "huggingface.co/learn"]):
61
+ cats["blogs"].append(u)
62
+ elif any(d in low for d in ["theverge.com", "techcrunch.com", "venturebeat.com", "wired.com", "mit.edu"]):
63
+ cats["news"].append(u)
64
+ else:
65
+ cats["other"].append(u)
66
+
67
+ def chips_section():
68
+ chips = [
69
+ ("Models", len(cats["models"])),
70
+ ("Datasets", len(cats["datasets"])),
71
+ ("Spaces", len(cats["spaces"])),
72
+ ("Papers", len(cats["papers"])),
73
+ ("Blogs/Docs", len(cats["blogs"])),
74
+ ("Repos", len(cats["repos"])),
75
+ ("Videos", len(cats["videos"])),
76
+ ("News", len(cats["news"]))
77
+ ]
78
+ return "\n".join([f"<div class=stat-chip>{name}: {count}</div>" for name, count in chips])
79
+
80
+ def host_icon(host: str) -> str:
81
+ return ""
82
+
83
+ def card_list(urls: List[str], data_cat: str) -> str:
84
+ items = []
85
+ for u in urls:
86
+ host = re.sub(r"^https?://", "", u).split("/")[0]
87
+ icon = host_icon(host)
88
+ favicon = f"https://www.google.com/s2/favicons?sz=32&domain={host}"
89
+ items.append(
90
+ f"<div class=card data-cat='{data_cat}'>"
91
+ f"<div class=card-title>{icon} <img class=\"fav\" src=\"{favicon}\" alt=\"\"/> <a href='{u}' target=_blank rel=noopener>{u}</a></div>"
92
+ f"<div class=card-subtitle>{host}</div>"
93
+ f"<div class=card-actions><button onclick=\"copyLink('{u}')\">Copy</button></div>"
94
+ "</div>"
95
+ )
96
+ return "\n".join(items)
97
+
98
+ def section(title_text: str, urls: List[str], key: str) -> str:
99
+ if not urls:
100
+ return ""
101
+ return f"<section data-key='{key}'><h2>{title_text}</h2><div class=cards>{card_list(urls, key)}</div></section>"
102
+
103
+ html = f"""<!DOCTYPE html>
104
+ <html lang=\"en\">
105
+ <head>
106
+ <meta charset=\"utf-8\" />
107
+ <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
108
+ <title>{doc_title}</title>
109
+ <style>
110
+ :root {{ --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; }}
111
+ body {{ background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }}
112
+ .container {{ max-width: 1200px; margin: 0 auto; }}
113
+ .header {{ display:flex; justify-content:space-between; align-items:center; gap:12px; margin-bottom: 12px; }}
114
+ .title {{ font-size: 22px; margin: 0; }}
115
+ .subtitle {{ color: var(--muted); }}
116
+ .stats {{ display:flex; gap:10px; flex-wrap:wrap; margin: 8px 0 18px; }}
117
+ .stat-chip {{ background: var(--card); border: 1px solid rgba(255,255,255,0.08); border-radius: 999px; padding: 6px 10px; font-size: 12px; color: var(--muted); }}
118
+ h2 {{ font-size: 16px; margin: 18px 0 8px; color: var(--accent); }}
119
+ .cards {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(280px,1fr)); gap: 12px; }}
120
+ .card {{ background: var(--card); border: 1px solid rgba(255,255,255,0.06); border-radius: 10px; padding: 12px; }}
121
+ .card-title {{ font-weight: 600; margin-bottom: 4px; overflow-wrap:anywhere; }}
122
+ .card-subtitle {{ color: var(--muted); font-size: 12px; }}
123
+ .answer {{ line-height:1.55; color:#d2d7df; }}
124
+ .card-actions button {{ background:#1f2937;color:#e5e7eb;border:1px solid rgba(255,255,255,0.08);border-radius:6px;padding:4px 8px;cursor:pointer;font-size:12px; }}
125
+ .fav {{ width:14px; height:14px; vertical-align:middle; margin-right:6px; border-radius:4px; }}
126
+ .warn {{ margin-left:6px; cursor: help; }}
127
+ </style>
128
+ <script src=\"https://cdn.jsdelivr.net/npm/marked/marked.min.js\"></script>
129
+ <script src=\"https://cdn.jsdelivr.net/npm/[email protected]/dist/purify.min.js\"></script>
130
+ </head>
131
+ <body>
132
+ <div class=\"container\">{('<div class=\\"header\\"><div><div class=\\"title\\">' + title + '</div></div></div>') if title else ''}
133
+ <h2>You may be interested <span class=\"warn\" title=\"Links may be AI‑generated and might not resolve.\">⚠️</span></h2>
134
+ <div class=\"stats\">{chips_section()}</div>
135
+ {section('Models', cats['models'], 'models')}
136
+ {section('Datasets', cats['datasets'], 'datasets')}
137
+ {section('Spaces', cats['spaces'], 'spaces')}
138
+ {section('Papers', cats['papers'], 'papers')}
139
+ {section('Blogs / Docs', cats['blogs'], 'blogs')}
140
+ {section('Repositories', cats['repos'], 'repos')}
141
+ {section('Videos', cats['videos'], 'videos')}
142
+ {section('News', cats['news'], 'news')}
143
+ {section('Other', cats['other'], 'other')}
144
+ </div>
145
+ <script>
146
+ function copyLink(url){{ try{{navigator.clipboard && navigator.clipboard.writeText(url);}}catch(e){{}} }}
147
+ </script>
148
+ </body>
149
+ </html>
150
+ """
151
+ return html
152
+ except Exception as e:
153
+ return f"<!DOCTYPE html><html><body><pre>Error generating report: {str(e)}</pre></body></html>"
scripts/run_agents.py ADDED
@@ -0,0 +1,87 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import textwrap
5
+ from pathlib import Path
6
+
7
+ # import tqdm.asyncio
8
+ from smolagents.utils import AgentError
9
+
10
+
11
+ def serialize_agent_error(obj):
12
+ if isinstance(obj, AgentError):
13
+ return {"error_type": obj.__class__.__name__, "message": obj.message}
14
+ else:
15
+ return str(obj)
16
+
17
+
18
+ def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
19
+ prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
20
+ {question}. But do not try to answer the question directly!
21
+ Do not add any information that is not present in the image."""
22
+ return visual_inspection_tool(image_path=file_name, question=prompt)
23
+
24
+
25
+ def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
26
+ prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
27
+ {question}. But do not try to answer the question directly!
28
+ Do not add any information that is not present in the document."""
29
+ return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
30
+
31
+
32
+ def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
33
+ file_extension = file_path.split(".")[-1]
34
+ if file_extension in ["png", "jpg", "jpeg"]:
35
+ file_description = f" - Attached image: {file_path}"
36
+ file_description += (
37
+ f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
38
+ )
39
+ return file_description
40
+ elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
41
+ file_description = f" - Attached document: {file_path}"
42
+ image_path = file_path.split(".")[0] + ".png"
43
+ if os.path.exists(image_path):
44
+ description = get_image_description(image_path, question, visual_inspection_tool)
45
+ else:
46
+ description = get_document_description(file_path, question, document_inspection_tool)
47
+ file_description += f"\n -> File description: {description}"
48
+ return file_description
49
+ elif file_extension in ["mp3", "m4a", "wav"]:
50
+ return f" - Attached audio: {file_path}"
51
+ else:
52
+ return f" - Attached file: {file_path}"
53
+
54
+
55
+ def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
56
+ folder_path = file_path.replace(".zip", "")
57
+ os.makedirs(folder_path, exist_ok=True)
58
+ shutil.unpack_archive(file_path, folder_path)
59
+
60
+ prompt_use_files = ""
61
+ for root, dirs, files in os.walk(folder_path):
62
+ for file in files:
63
+ file_path = os.path.join(root, file)
64
+ prompt_use_files += "\n" + textwrap.indent(
65
+ get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
66
+ prefix=" ",
67
+ )
68
+ return prompt_use_files
69
+
70
+
71
+ def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
72
+ f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
73
+ done = set()
74
+ if f.exists():
75
+ with open(f, encoding="utf-8") as fh:
76
+ done = {json.loads(line)["task_id"] for line in fh if line.strip()}
77
+
78
+ tasks = []
79
+ for i in range(total):
80
+ task_id = int(data[i]["task_id"])
81
+ if task_id not in done:
82
+ if tasks_ids is not None:
83
+ if task_id in tasks_ids:
84
+ tasks.append(data[i])
85
+ else:
86
+ tasks.append(data[i])
87
+ return tasks
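A small sketch of how `get_tasks_to_run` is meant to be driven; the file name and task ids are placeholders. Tasks already recorded in `<stem>_answers.jsonl` are skipped:

    from pathlib import Path
    from scripts.run_agents import get_tasks_to_run  # assumed import path within this repo

    data = [
        {"task_id": "1", "question": "..."},
        {"task_id": "2", "question": "..."},
    ]
    todo = get_tasks_to_run(
        data, total=len(data), base_filename=Path("runs/gaia_validation.jsonl"), tasks_ids=[2]
    )
    print([t["task_id"] for t in todo])  # only task 2, and only if it has no saved answer yet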
scripts/text_inspector_tool.py ADDED
@@ -0,0 +1,88 @@
1
+ from typing import Optional
2
+ import os
3
+
4
+ from smolagents import Tool
5
+ from smolagents.models import MessageRole, Model
6
+
7
+ from .mdconvert import MarkdownConverter
8
+
9
+
10
+ class TextInspectorTool(Tool):
11
+ name = "inspect_file_as_text"
12
+ description = """
13
+ You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
14
+ This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx", ".mjs", ".js"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
15
+
16
+ inputs = {
17
+ "file_path": {
18
+ "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
19
+ "type": "string",
20
+ },
21
+ "question": {
22
+ "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
23
+ "type": "string",
24
+ "nullable": True,
25
+ },
26
+ }
27
+ output_type = "string"
28
+ md_converter = MarkdownConverter()
29
+
30
+ def __init__(self, model: Model, text_limit: int):
31
+ super().__init__()
32
+ self.model = model
33
+ self.text_limit = text_limit
34
+
35
+ def forward_initial_exam_mode(self, file_path, question):
36
+ try:
37
+ # Only allow reading files from uploads directory
38
+ uploads_dir = os.path.abspath(os.path.join(os.getcwd(), "uploads"))
39
+ candidate_path = os.path.abspath(file_path)
40
+ if not candidate_path.startswith(uploads_dir + os.sep):
41
+ # Fallback to uploads/<basename>
42
+ candidate_path = os.path.join(uploads_dir, os.path.basename(file_path))
43
+
44
+ result = self.md_converter.convert(candidate_path)
45
+
46
+ if file_path[-4:] in [".png", ".jpg"]:
47
+ raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
48
+
49
+ if ".zip" in file_path:
50
+ return result.text_content
51
+
52
+ if not question:
53
+ return result.text_content
54
+
55
+ if len(result.text_content) < 4000:
56
+ return "Document content: " + result.text_content
57
+
58
+ # For larger files, just return the content without model processing to avoid freezing
59
+ return f"Document title: {result.title}\n\nDocument content:\n{result.text_content[:self.text_limit]}"
60
+
61
+ except Exception as e:
62
+ return f"Error reading file '{file_path}': {str(e)}. Access is restricted to files uploaded via the interface."
63
+
64
+ def forward(self, file_path, question: Optional[str] = None) -> str:
65
+ try:
66
+ # Only allow reading files from uploads directory
67
+ uploads_dir = os.path.abspath(os.path.join(os.getcwd(), "uploads"))
68
+ candidate_path = os.path.abspath(file_path)
69
+ if not candidate_path.startswith(uploads_dir + os.sep):
70
+ # Fallback to uploads/<basename>
71
+ candidate_path = os.path.join(uploads_dir, os.path.basename(file_path))
72
+
73
+ result = self.md_converter.convert(candidate_path)
74
+
75
+ if file_path[-4:] in [".png", ".jpg"]:
76
+ raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
77
+
78
+ if ".zip" in file_path:
79
+ return result.text_content
80
+
81
+ if not question:
82
+ return result.text_content
83
+
84
+ # For questions, return the content with a note about the question
85
+ return f"Question: {question}\n\nDocument title: {result.title}\n\nDocument content:\n{result.text_content[:self.text_limit]}"
86
+
87
+ except Exception as e:
88
+ return f"Error reading file '{file_path}': {str(e)}. Access is restricted to files uploaded via the interface."
scripts/text_web_browser.py ADDED
@@ -0,0 +1,564 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import mimetypes
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import time
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+ from urllib.parse import unquote, urljoin, urlparse
11
+
12
+ import pathvalidate
13
+ import requests
14
+ from serpapi import GoogleSearch
15
+ # from serpapi.google_search import GoogleSearch
16
+
17
+ from smolagents import Tool
18
+
19
+ from .cookies import COOKIES
20
+ from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
21
+
22
+
23
+ class SimpleTextBrowser:
24
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
25
+
26
+ def __init__(
27
+ self,
28
+ start_page: Optional[str] = None,
29
+ viewport_size: Optional[int] = 1024 * 8,
30
+ downloads_folder: Optional[Union[str, None]] = None,
31
+ serpapi_key: Optional[Union[str, None]] = None,
32
+ request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
33
+ ):
34
+ self.start_page: str = start_page if start_page else "about:blank"
35
+ self.viewport_size = viewport_size # Applies only to the standard uri types
36
+ self.downloads_folder = downloads_folder
37
+ self.history: List[Tuple[str, float]] = list()
38
+ self.page_title: Optional[str] = None
39
+ self.viewport_current_page = 0
40
+ self.viewport_pages: List[Tuple[int, int]] = list()
41
+ self.set_address(self.start_page)
42
+ self.serpapi_key = serpapi_key
43
+ self.request_kwargs = request_kwargs if request_kwargs is not None else {}
44
+ self.request_kwargs["cookies"] = COOKIES
45
+ self._mdconvert = MarkdownConverter()
46
+ self._page_content: str = ""
47
+
48
+ self._find_on_page_query: Union[str, None] = None
49
+ self._find_on_page_last_result: Union[int, None] = None # Location of the last result
50
+
51
+ @property
52
+ def address(self) -> str:
53
+ """Return the address of the current page."""
54
+ return self.history[-1][0]
55
+
56
+ def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
57
+ # TODO: Handle anchors
58
+ self.history.append((uri_or_path, time.time()))
59
+
60
+ # Handle special URIs
61
+ if uri_or_path == "about:blank":
62
+ self._set_page_content("")
63
+ elif uri_or_path.startswith("google:"):
64
+ self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
65
+ else:
66
+ if (
67
+ not uri_or_path.startswith("http:")
68
+ and not uri_or_path.startswith("https:")
69
+ and not uri_or_path.startswith("file:")
70
+ ):
71
+ if len(self.history) > 1:
72
+ prior_address = self.history[-2][0]
73
+ uri_or_path = urljoin(prior_address, uri_or_path)
74
+ # Update the address with the fully-qualified path
75
+ self.history[-1] = (uri_or_path, self.history[-1][1])
76
+ self._fetch_page(uri_or_path)
77
+
78
+ self.viewport_current_page = 0
79
+ self._find_on_page_query = None
80
+ self._find_on_page_last_result = None
81
+
82
+ @property
83
+ def viewport(self) -> str:
84
+ """Return the content of the current viewport."""
85
+ bounds = self.viewport_pages[self.viewport_current_page]
86
+ return self.page_content[bounds[0] : bounds[1]]
87
+
88
+ @property
89
+ def page_content(self) -> str:
90
+ """Return the full contents of the current page."""
91
+ return self._page_content
92
+
93
+ def _set_page_content(self, content: str) -> None:
94
+ """Sets the text content of the current page."""
95
+ self._page_content = content
96
+ self._split_pages()
97
+ if self.viewport_current_page >= len(self.viewport_pages):
98
+ self.viewport_current_page = len(self.viewport_pages) - 1
99
+
100
+ def page_down(self) -> None:
101
+ self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
102
+
103
+ def page_up(self) -> None:
104
+ self.viewport_current_page = max(self.viewport_current_page - 1, 0)
105
+
106
+ def find_on_page(self, query: str) -> Union[str, None]:
107
+ """Searches for the query from the current viewport forward, looping back to the start if necessary."""
108
+
109
+ # Did we get here via a previous find_on_page search with the same query?
110
+ # If so, map to find_next
111
+ if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
112
+ return self.find_next()
113
+
114
+ # Ok it's a new search start from the current viewport
115
+ self._find_on_page_query = query
116
+ viewport_match = self._find_next_viewport(query, self.viewport_current_page)
117
+ if viewport_match is None:
118
+ self._find_on_page_last_result = None
119
+ return None
120
+ else:
121
+ self.viewport_current_page = viewport_match
122
+ self._find_on_page_last_result = viewport_match
123
+ return self.viewport
124
+
125
+ def find_next(self) -> Union[str, None]:
126
+ """Scroll to the next viewport that matches the query"""
127
+
128
+ if self._find_on_page_query is None:
129
+ return None
130
+
131
+ starting_viewport = self._find_on_page_last_result
132
+ if starting_viewport is None:
133
+ starting_viewport = 0
134
+ else:
135
+ starting_viewport += 1
136
+ if starting_viewport >= len(self.viewport_pages):
137
+ starting_viewport = 0
138
+
139
+ viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
140
+ if viewport_match is None:
141
+ self._find_on_page_last_result = None
142
+ return None
143
+ else:
144
+ self.viewport_current_page = viewport_match
145
+ self._find_on_page_last_result = viewport_match
146
+ return self.viewport
147
+
148
+ def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
149
+ """Search for matches between the starting viewport looping when reaching the end."""
150
+
151
+ if query is None:
152
+ return None
153
+
154
+ # Normalize the query, and convert to a regular expression
155
+ nquery = re.sub(r"\*", "__STAR__", query)
156
+ nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
157
+ nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
158
+ nquery = nquery.replace("__STAR__", ".*").lower()
159
+
160
+ if nquery.strip() == "":
161
+ return None
162
+
163
+ idxs = list()
164
+ idxs.extend(range(starting_viewport, len(self.viewport_pages)))
165
+ idxs.extend(range(0, starting_viewport))
166
+
167
+ for i in idxs:
168
+ bounds = self.viewport_pages[i]
169
+ content = self.page_content[bounds[0] : bounds[1]]
170
+
171
+ # TODO: Remove markdown links and images
172
+ ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
173
+ if re.search(nquery, ncontent):
174
+ return i
175
+
176
+ return None
177
+
178
+ def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
179
+ """Update the address, visit the page, and return the content of the viewport."""
180
+ self.set_address(path_or_uri, filter_year=filter_year)
181
+ return self.viewport
182
+
183
+ def _split_pages(self) -> None:
184
+ # Do not split search results
185
+ if self.address.startswith("google:"):
186
+ self.viewport_pages = [(0, len(self._page_content))]
187
+ return
188
+
189
+ # Handle empty pages
190
+ if len(self._page_content) == 0:
191
+ self.viewport_pages = [(0, 0)]
192
+ return
193
+
194
+ # Break the viewport into pages
195
+ self.viewport_pages = []
196
+ start_idx = 0
197
+ while start_idx < len(self._page_content):
198
+ end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
199
+ # Adjust to end on a space
200
+ while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
201
+ end_idx += 1
202
+ self.viewport_pages.append((start_idx, end_idx))
203
+ start_idx = end_idx
204
+
205
+ def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
206
+ if self.serpapi_key is None:
207
+ raise ValueError("Missing SerpAPI key.")
208
+
209
+ params = {
210
+ "engine": "google",
211
+ "q": query,
212
+ "api_key": self.serpapi_key,
213
+ }
214
+ if filter_year is not None:
215
+ params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
216
+
217
+ search = GoogleSearch(params)
218
+ results = search.get_dict()
219
+ self.page_title = f"{query} - Search"
220
+ if "organic_results" not in results.keys():
221
+ raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
222
+ if len(results["organic_results"]) == 0:
223
+ year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
224
+ self._set_page_content(
225
+ f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
226
+ )
227
+ return
228
+
229
+ def _prev_visit(url):
230
+ for i in range(len(self.history) - 1, -1, -1):
231
+ if self.history[i][0] == url:
232
+ return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
233
+ return ""
234
+
235
+ web_snippets: List[str] = list()
236
+ idx = 0
237
+ if "organic_results" in results:
238
+ for page in results["organic_results"]:
239
+ idx += 1
240
+ date_published = ""
241
+ if "date" in page:
242
+ date_published = "\nDate published: " + page["date"]
243
+
244
+ source = ""
245
+ if "source" in page:
246
+ source = "\nSource: " + page["source"]
247
+
248
+ snippet = ""
249
+ if "snippet" in page:
250
+ snippet = "\n" + page["snippet"]
251
+
252
+ redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
253
+
254
+ redacted_version = redacted_version.replace("Your browser can't play this video.", "")
255
+ web_snippets.append(redacted_version)
256
+
257
+ content = (
258
+ f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
259
+ + "\n\n".join(web_snippets)
260
+ )
261
+
262
+ self._set_page_content(content)
263
+
264
+ def _fetch_page(self, url: str) -> None:
265
+ download_path = ""
266
+ try:
267
+ if url.startswith("file://"):
268
+ download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
269
+ res = self._mdconvert.convert_local(download_path)
270
+ self.page_title = res.title
271
+ self._set_page_content(res.text_content)
272
+ else:
273
+ # Prepare the request parameters
274
+ request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
275
+ request_kwargs["stream"] = True
276
+
277
+ # Send a HTTP request to the URL
278
+ response = requests.get(url, **request_kwargs)
279
+ response.raise_for_status()
280
+
281
+ # If the HTTP request was successful
282
+ content_type = response.headers.get("content-type", "")
283
+
284
+ # Text or HTML
285
+ if "text/" in content_type.lower():
286
+ res = self._mdconvert.convert_response(response)
287
+ self.page_title = res.title
288
+ self._set_page_content(res.text_content)
289
+ # A download
290
+ else:
291
+ # Try producing a safe filename
292
+ fname = None
293
+ download_path = None
294
+ try:
295
+ fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
296
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
297
+
298
+ suffix = 0
299
+ while os.path.exists(download_path) and suffix < 1000:
300
+ suffix += 1
301
+ base, ext = os.path.splitext(fname)
302
+ new_fname = f"{base}__{suffix}{ext}"
303
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
304
+
305
+ except NameError:
306
+ pass
307
+
308
+ # No suitable name, so make one
309
+ if fname is None:
310
+ extension = mimetypes.guess_extension(content_type)
311
+ if extension is None:
312
+ extension = ".download"
313
+ fname = str(uuid.uuid4()) + extension
314
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
315
+
316
+ # Open a file for writing
317
+ with open(download_path, "wb") as fh:
318
+ for chunk in response.iter_content(chunk_size=512):
319
+ fh.write(chunk)
320
+
321
+ # Render it
322
+ local_uri = pathlib.Path(download_path).as_uri()
323
+ self.set_address(local_uri)
324
+
325
+ except UnsupportedFormatException as e:
326
+ print(e)
327
+ self.page_title = ("Download complete.",)
328
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
329
+ except FileConversionException as e:
330
+ print(e)
331
+ self.page_title = ("Download complete.",)
332
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
333
+ except FileNotFoundError:
334
+ self.page_title = "Error 404"
335
+ self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
336
+ except requests.exceptions.RequestException as request_exception:
337
+ try:
338
+ self.page_title = f"Error {response.status_code}"
339
+
340
+ # If the error was rendered in HTML we might as well render it
341
+ content_type = response.headers.get("content-type", "")
342
+ if content_type is not None and "text/html" in content_type.lower():
343
+ res = self._mdconvert.convert(response)
344
+ self.page_title = f"Error {response.status_code}"
345
+ self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
346
+ else:
347
+ text = ""
348
+ for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
349
+ text += chunk
350
+ self.page_title = f"Error {response.status_code}"
351
+ self._set_page_content(f"## Error {response.status_code}\n\n{text}")
352
+ except NameError:
353
+ self.page_title = "Error"
354
+ self._set_page_content(f"## Error\n\n{str(request_exception)}")
355
+
356
+ def _state(self) -> Tuple[str, str]:
357
+ header = f"Address: {self.address}\n"
358
+ if self.page_title is not None:
359
+ header += f"Title: {self.page_title}\n"
360
+
361
+ current_page = self.viewport_current_page
362
+ total_pages = len(self.viewport_pages)
363
+
364
+ address = self.address
365
+ for i in range(len(self.history) - 2, -1, -1): # Start from the second last
366
+ if self.history[i][0] == address:
367
+ header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
368
+ break
369
+
370
+ header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
371
+ return (header, self.viewport)
372
+
373
+
374
+ class SearchInformationTool(Tool):
375
+ name = "web_search"
376
+ description = "Perform a web search query (think a google search) and returns the search results."
377
+ inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
378
+ inputs["filter_year"] = {
379
+ "type": "string",
380
+ "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
381
+ "nullable": True,
382
+ }
383
+ output_type = "string"
384
+
385
+ def __init__(self, browser):
386
+ super().__init__()
387
+ self.browser = browser
388
+
389
+ def forward(self, query: str, filter_year: Optional[int] = None) -> str:
390
+ self.browser.visit_page(f"google: {query}", filter_year=filter_year)
391
+ header, content = self.browser._state()
392
+ return header.strip() + "\n=======================\n" + content
393
+
394
+
395
+ class VisitTool(Tool):
396
+ name = "visit_page"
397
+ description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
398
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webapge to visit."}}
399
+ output_type = "string"
400
+
401
+ def __init__(self, browser):
402
+ super().__init__()
403
+ self.browser = browser
404
+
405
+ def forward(self, url: str) -> str:
406
+ self.browser.visit_page(url)
407
+ header, content = self.browser._state()
408
+ return header.strip() + "\n=======================\n" + content
409
+
410
+
411
+ class DownloadTool(Tool):
412
+ name = "download_file"
413
+ description = """
414
+ Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
415
+ After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
416
+ DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
417
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
418
+ output_type = "string"
419
+
420
+ def __init__(self, browser):
421
+ super().__init__()
422
+ self.browser = browser
423
+
424
+ def forward(self, url: str) -> str:
425
+ if "arxiv" in url:
426
+ url = url.replace("abs", "pdf")
427
+ response = requests.get(url)
428
+ content_type = response.headers.get("content-type", "")
429
+ extension = mimetypes.guess_extension(content_type)
430
+ if extension and isinstance(extension, str):
431
+ new_path = f"./downloads/file{extension}"
432
+ else:
433
+ new_path = "./downloads/file.object"
434
+
435
+ with open(new_path, "wb") as f:
436
+ f.write(response.content)
437
+
438
+ if "pdf" in extension or "txt" in extension or "htm" in extension:
439
+ raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
440
+
441
+ return f"File was downloaded and saved under path {new_path}."
442
+
443
+
444
+ class ArchiveSearchTool(Tool):
445
+ name = "find_archived_url"
446
+ description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
447
+ inputs = {
448
+ "url": {"type": "string", "description": "The url you need the archive for."},
449
+ "date": {
450
+ "type": "string",
451
+ "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
452
+ },
453
+ }
454
+ output_type = "string"
455
+
456
+ def __init__(self, browser):
457
+ super().__init__()
458
+ self.browser = browser
459
+
460
+ def forward(self, url, date) -> str:
461
+ no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
462
+ archive_url = no_timestamp_url + f"&timestamp={date}"
463
+ response = requests.get(archive_url).json()
464
+ response_notimestamp = requests.get(no_timestamp_url).json()
465
+ if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
466
+ closest = response["archived_snapshots"]["closest"]
467
+ print("Archive found!", closest)
468
+
469
+ elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
470
+ closest = response_notimestamp["archived_snapshots"]["closest"]
471
+ print("Archive found!", closest)
472
+ else:
473
+ raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
474
+ target_url = closest["url"]
475
+ self.browser.visit_page(target_url)
476
+ header, content = self.browser._state()
477
+ return (
478
+ f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
479
+ + header.strip()
480
+ + "\n=======================\n"
481
+ + content
482
+ )
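For reference, a sketch of the Wayback Machine availability response that this forward() indexes into. The field names match the archive.org availability API; the values below are placeholders, not real archive data:

example_response = {
    "archived_snapshots": {
        "closest": {
            "available": True,   # a snapshot exists
            "url": "http://web.archive.org/web/20080627000000/http://example.com/",
            "timestamp": "20080627000000",   # first 8 characters are echoed back to the agent
            "status": "200",
        }
    }
}
# The tool reads ["archived_snapshots"]["closest"]["url"] and ["timestamp"] from this structure.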
483
+
484
+
485
+ class PageUpTool(Tool):
486
+ name = "page_up"
487
+ description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
488
+ inputs = {}
489
+ output_type = "string"
490
+
491
+ def __init__(self, browser):
492
+ super().__init__()
493
+ self.browser = browser
494
+
495
+ def forward(self) -> str:
496
+ self.browser.page_up()
497
+ header, content = self.browser._state()
498
+ return header.strip() + "\n=======================\n" + content
499
+
500
+
501
+ class PageDownTool(Tool):
502
+ name = "page_down"
503
+ description = (
504
+ "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
505
+ )
506
+ inputs = {}
507
+ output_type = "string"
508
+
509
+ def __init__(self, browser):
510
+ super().__init__()
511
+ self.browser = browser
512
+
513
+ def forward(self) -> str:
514
+ self.browser.page_down()
515
+ header, content = self.browser._state()
516
+ return header.strip() + "\n=======================\n" + content
517
+
518
+
519
+ class FinderTool(Tool):
520
+ name = "find_on_page_ctrl_f"
521
+ description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
522
+ inputs = {
523
+ "search_string": {
524
+ "type": "string",
525
+ "description": "The string to search for on the page. This search string supports wildcards like '*'",
526
+ }
527
+ }
528
+ output_type = "string"
529
+
530
+ def __init__(self, browser):
531
+ super().__init__()
532
+ self.browser = browser
533
+
534
+ def forward(self, search_string: str) -> str:
535
+ find_result = self.browser.find_on_page(search_string)
536
+ header, content = self.browser._state()
537
+
538
+ if find_result is None:
539
+ return (
540
+ header.strip()
541
+ + f"\n=======================\nThe search string '{search_string}' was not found on this page."
542
+ )
543
+ else:
544
+ return header.strip() + "\n=======================\n" + content
545
+
546
+
547
+ class FindNextTool(Tool):
548
+ name = "find_next"
549
+ description = "Scroll the viewport to the next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
550
+ inputs = {}
551
+ output_type = "string"
552
+
553
+ def __init__(self, browser):
554
+ super().__init__()
555
+ self.browser = browser
556
+
557
+ def forward(self) -> str:
558
+ find_result = self.browser.find_next()
559
+ header, content = self.browser._state()
560
+
561
+ if find_result is None:
562
+ return header.strip() + "\n=======================\nThe search string was not found on this page."
563
+ else:
564
+ return header.strip() + "\n=======================\n" + content
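The tool classes above all take the same single constructor argument, so they can be grouped behind one helper. A minimal wiring sketch, assuming browser is the browser instance built earlier in text_web_browser.py, i.e. an object exposing visit_page(), _state(), page_up(), page_down(), find_on_page() and find_next():

def build_browser_tools(browser):
    # The web-search tool defined above would be appended here as well;
    # its class name falls outside this excerpt, so it is omitted.
    return [
        VisitTool(browser),
        DownloadTool(browser),
        ArchiveSearchTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
    ]

The returned list can then be handed to whichever smolagents agent the application constructs.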
scripts/visual_qa.py ADDED
@@ -0,0 +1,120 @@
1
+ import base64
2
+ import json
3
+ import mimetypes
4
+ import os
5
+ import uuid
6
+ from io import BytesIO
7
+ from typing import Optional
8
+
9
+ import requests
10
+ from dotenv import load_dotenv
11
+ from PIL import Image
12
+
13
+ from smolagents import Tool, tool
14
+
15
+
16
+ load_dotenv(override=True)
17
+
18
+
19
+ def encode_image(image_path):
20
+ if image_path.startswith("http"):
21
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
22
+ request_kwargs = {
23
+ "headers": {"User-Agent": user_agent},
24
+ "stream": True,
25
+ }
26
+
27
+ # Send a HTTP request to the URL
28
+ response = requests.get(image_path, **request_kwargs)
29
+ response.raise_for_status()
30
+ content_type = response.headers.get("content-type", "")
31
+
32
+ extension = mimetypes.guess_extension(content_type)
33
+ if extension is None:
34
+ extension = ".download"
35
+
36
+ fname = str(uuid.uuid4()) + extension
37
+ download_path = os.path.abspath(os.path.join("downloads", fname))
38
+
39
+ with open(download_path, "wb") as fh:
40
+ for chunk in response.iter_content(chunk_size=512):
41
+ fh.write(chunk)
42
+
43
+ image_path = download_path
44
+
45
+ with open(image_path, "rb") as image_file:
46
+ return base64.b64encode(image_file.read()).decode("utf-8")
47
+
48
+
49
+ def resize_image(image_path):
50
+ img = Image.open(image_path)
51
+ width, height = img.size
52
+ img = img.resize((int(width / 2), int(height / 2)))
53
+ new_image_path = os.path.join(os.path.dirname(image_path), "resized_" + os.path.basename(image_path))
54
+ img.save(new_image_path)
55
+ return new_image_path
56
+
57
+
58
+ @tool
59
+ def visualizer(image_path: str, question: Optional[str] = None) -> str:
60
+ """A tool that can answer questions about attached images.
61
+
62
+ Args:
63
+ image_path: The path to the image on which to answer the question. This should be a local path to downloaded image.
64
+ question: The question to answer.
65
+ """
66
+ if not isinstance(image_path, str):
67
+ raise Exception("You should provide at least `image_path` string argument to this tool!")
68
+
69
+ add_note = False
70
+ if not question:
71
+ add_note = True
72
+ question = "Please write a detailed caption for this image."
73
+
74
+ mime_type, _ = mimetypes.guess_type(image_path)
75
+ base64_image = encode_image(image_path)
76
+
77
+ # Configuration for Ollama
78
+ model_id = os.getenv("MODEL_ID", "qwen2.5-coder:3b")
79
+ api_base = os.getenv("OPENAI_API_BASE", "http://localhost:11434/v1")
80
+ api_key = os.getenv("OPENAI_API_KEY", "ollama")
81
+
82
+ headers = {
83
+ "Content-Type": "application/json",
84
+ "Authorization": f"Bearer {api_key}"
85
+ }
86
+
87
+ payload = {
88
+ "model": model_id,
89
+ "messages": [
90
+ {
91
+ "role": "user",
92
+ "content": [
93
+ {"type": "text", "text": question},
94
+ {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
95
+ ],
96
+ }
97
+ ],
98
+ "max_tokens": 1000,
99
+ }
100
+
101
+ try:
102
+ response = requests.post(f"{api_base}/chat/completions", headers=headers, json=payload)
103
+ response.raise_for_status()
104
+ output = response.json()["choices"][0]["message"]["content"]
105
+ except Exception as e:
106
+ print(f"Error processing image: {str(e)}")
107
+ if "Payload Too Large" in str(e):
108
+ new_image_path = resize_image(image_path)
109
+ base64_image = encode_image(new_image_path)
110
+ payload["messages"][0]["content"][1]["image_url"]["url"] = f"data:{mime_type};base64,{base64_image}"
111
+ response = requests.post(f"{api_base}/chat/completions", headers=headers, json=payload)
112
+ response.raise_for_status()
113
+ output = response.json()["choices"][0]["message"]["content"]
114
+ else:
115
+ raise Exception(f"Error processing image: {str(e)}")
116
+
117
+ if add_note:
118
+ output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
119
+
120
+ return output
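A minimal usage sketch: since visualizer is wrapped with the smolagents @tool decorator, the resulting tool object can typically be invoked directly. The file name and question below are placeholder values, and the call assumes MODEL_ID / OPENAI_API_BASE point at a vision-capable, OpenAI-compatible endpoint:

# hypothetical inputs for illustration only
caption = visualizer(image_path="downloads/example.png", question="What error message is shown?")
print(caption)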
set-env.bat ADDED
@@ -0,0 +1,11 @@
1
+ REM siliconflow is rate limited, won't work for deep-research
2
+ REM set OPENAI_API_BASE=https://api.siliconflow.cn/v1
3
+ REM set OPENAI_API_KEY=%SILICONFLOW_API_KEY%
4
+ REM set MODEL_ID=openai/deepseek-ai/DeepSeek-V3
5
+
6
+ set OPENAI_API_BASE=https://litellm.dattw.eu.org/v1
7
+ set OPENAI_API_KEY=%LITELLM_API_KEY%
8
+ set MODEL_ID=Qwen/Qwen3-Coder-480B-A35B-Instruct
9
+
10
+ REM set SERPAPI_API_KEY=b84...
11
+ REM set HF_TOKEN=hf_yC...
tests/hf_tools_tests_output_20250822_034011.txt ADDED
@@ -0,0 +1,2425 @@
1
+ Hugging Face Tools Test Run — 20250822_034011
2
+ ================================================================================
3
+
4
+ === Running test_hf_models_search.py ===
5
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_models_search.py
6
+ --- INPUT (snippet) ---
7
+ tool.forward(
8
+ query="stable diffusion",
9
+ task="text-to-image",
10
+ sort="downloads",
11
+ direction="descending",
12
+ limit=5,
13
+ )
14
+ try:
15
+ data = json.loads(result_json_str)
16
+ print(json.dumps(data, indent=2, ensure_ascii=False))
17
+ except Exception:
18
+ print(result_json_str)
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
23
+
24
+
25
+ --- STDOUT ---
26
+ {
27
+ "results": [
28
+ {
29
+ "type": "model",
30
+ "id": "stabilityai/stable-diffusion-xl-base-1.0",
31
+ "owner": "stabilityai",
32
+ "url": "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0",
33
+ "description": "",
34
+ "tags": [
35
+ "diffusers",
36
+ "onnx",
37
+ "safetensors",
38
+ "text-to-image",
39
+ "stable-diffusion",
40
+ "arxiv:2307.01952",
41
+ "arxiv:2211.01324",
42
+ "arxiv:2108.01073",
43
+ "arxiv:2112.10752",
44
+ "license:openrail++",
45
+ "autotrain_compatible",
46
+ "endpoints_compatible",
47
+ "diffusers:StableDiffusionXLPipeline",
48
+ "region:us"
49
+ ],
50
+ "task": "text-to-image",
51
+ "likes": 6862,
52
+ "downloads": 2281071,
53
+ "updatedAt": null,
54
+ "visibility": "public",
55
+ "access": "accessible"
56
+ },
57
+ {
58
+ "type": "model",
59
+ "id": "stable-diffusion-v1-5/stable-diffusion-v1-5",
60
+ "owner": "stable-diffusion-v1-5",
61
+ "url": "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5",
62
+ "description": "",
63
+ "tags": [
64
+ "diffusers",
65
+ "safetensors",
66
+ "stable-diffusion",
67
+ "stable-diffusion-diffusers",
68
+ "text-to-image",
69
+ "arxiv:2207.12598",
70
+ "arxiv:2112.10752",
71
+ "arxiv:2103.00020",
72
+ "arxiv:2205.11487",
73
+ "arxiv:1910.09700",
74
+ "license:creativeml-openrail-m",
75
+ "autotrain_compatible",
76
+ "endpoints_compatible",
77
+ "diffusers:StableDiffusionPipeline",
78
+ "region:us"
79
+ ],
80
+ "task": "text-to-image",
81
+ "likes": 755,
82
+ "downloads": 2789469,
83
+ "updatedAt": null,
84
+ "visibility": "public",
85
+ "access": "accessible"
86
+ },
87
+ {
88
+ "type": "model",
89
+ "id": "stabilityai/stable-diffusion-xl-refiner-1.0",
90
+ "owner": "stabilityai",
91
+ "url": "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0",
92
+ "description": "",
93
+ "tags": [
94
+ "diffusers",
95
+ "safetensors",
96
+ "stable-diffusion",
97
+ "image-to-image",
98
+ "arxiv:2307.01952",
99
+ "arxiv:2211.01324",
100
+ "arxiv:2108.01073",
101
+ "arxiv:2112.10752",
102
+ "license:openrail++",
103
+ "diffusers:StableDiffusionXLImg2ImgPipeline",
104
+ "region:us"
105
+ ],
106
+ "task": "image-to-image",
107
+ "likes": 1953,
108
+ "downloads": 544617,
109
+ "updatedAt": null,
110
+ "visibility": "public",
111
+ "access": "accessible"
112
+ },
113
+ {
114
+ "type": "model",
115
+ "id": "stabilityai/stable-diffusion-3.5-large",
116
+ "owner": "stabilityai",
117
+ "url": "https://huggingface.co/stabilityai/stable-diffusion-3.5-large",
118
+ "description": "",
119
+ "tags": [
120
+ "diffusers",
121
+ "safetensors",
122
+ "text-to-image",
123
+ "stable-diffusion",
124
+ "en",
125
+ "arxiv:2403.03206",
126
+ "license:other",
127
+ "diffusers:StableDiffusion3Pipeline",
128
+ "region:us"
129
+ ],
130
+ "task": "text-to-image",
131
+ "likes": 3073,
132
+ "downloads": 82027,
133
+ "updatedAt": null,
134
+ "visibility": "public",
135
+ "access": "accessible"
136
+ },
137
+ {
138
+ "type": "model",
139
+ "id": "stabilityai/stable-video-diffusion-img2vid",
140
+ "owner": "stabilityai",
141
+ "url": "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid",
142
+ "description": "",
143
+ "tags": [
144
+ "diffusers",
145
+ "safetensors",
146
+ "image-to-video",
147
+ "license:other",
148
+ "diffusers:StableVideoDiffusionPipeline",
149
+ "region:us"
150
+ ],
151
+ "task": "image-to-video",
152
+ "likes": 957,
153
+ "downloads": 46464,
154
+ "updatedAt": null,
155
+ "visibility": "public",
156
+ "access": "accessible"
157
+ }
158
+ ],
159
+ "status": 200,
160
+ "error": "",
161
+ "params": {
162
+ "search": "stable diffusion",
163
+ "pipeline_tag": "text-to-image",
164
+ "sort": "downloads",
165
+ "direction": "descending",
166
+ "limit": 5
167
+ }
168
+ }
169
+ --- STDERR ---
170
+
171
+ [OK] test_hf_models_search.py
172
+ --------------------------------------------------------------------------------
173
+
174
+ === Running test_hf_model_info.py ===
175
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_model_info.py
176
+ --- INPUT (snippet) ---
177
+ tool.forward(repo_id=repo_id)
178
+ try:
179
+ data = json.loads(result_json_str)
180
+ print(json.dumps(data, indent=2, ensure_ascii=False))
181
+ except Exception:
182
+ print(result_json_str)
183
+
184
+
185
+ if __name__ == "__main__":
186
+ main()
187
+
188
+
189
+ --- STDOUT ---
190
+ {
191
+ "item": {
192
+ "type": "model",
193
+ "id": "sentence-transformers/all-MiniLM-L6-v2",
194
+ "owner": "sentence-transformers",
195
+ "url": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
196
+ "description": "",
197
+ "tags": [
198
+ "sentence-transformers",
199
+ "pytorch",
200
+ "tf",
201
+ "rust",
202
+ "onnx",
203
+ "safetensors",
204
+ "openvino",
205
+ "bert",
206
+ "feature-extraction",
207
+ "sentence-similarity",
208
+ "transformers",
209
+ "en",
210
+ "dataset:s2orc",
211
+ "dataset:flax-sentence-embeddings/stackexchange_xml",
212
+ "dataset:ms_marco",
213
+ "dataset:gooaq",
214
+ "dataset:yahoo_answers_topics",
215
+ "dataset:code_search_net",
216
+ "dataset:search_qa",
217
+ "dataset:eli5",
218
+ "dataset:snli",
219
+ "dataset:multi_nli",
220
+ "dataset:wikihow",
221
+ "dataset:natural_questions",
222
+ "dataset:trivia_qa",
223
+ "dataset:embedding-data/sentence-compression",
224
+ "dataset:embedding-data/flickr30k-captions",
225
+ "dataset:embedding-data/altlex",
226
+ "dataset:embedding-data/simple-wiki",
227
+ "dataset:embedding-data/QQP",
228
+ "dataset:embedding-data/SPECTER",
229
+ "dataset:embedding-data/PAQ_pairs",
230
+ "dataset:embedding-data/WikiAnswers",
231
+ "arxiv:1904.06472",
232
+ "arxiv:2102.07033",
233
+ "arxiv:2104.08727",
234
+ "arxiv:1704.05179",
235
+ "arxiv:1810.09305",
236
+ "license:apache-2.0",
237
+ "autotrain_compatible",
238
+ "text-embeddings-inference",
239
+ "endpoints_compatible",
240
+ "region:us"
241
+ ],
242
+ "task": "sentence-similarity",
243
+ "likes": 3788,
244
+ "downloads": 91944061,
245
+ "updatedAt": "2025-03-06T13:37:44.000Z",
246
+ "visibility": "public",
247
+ "access": "accessible",
248
+ "cardData": {
249
+ "language": "en",
250
+ "license": "apache-2.0",
251
+ "library_name": "sentence-transformers",
252
+ "tags": [
253
+ "sentence-transformers",
254
+ "feature-extraction",
255
+ "sentence-similarity",
256
+ "transformers"
257
+ ],
258
+ "datasets": [
259
+ "s2orc",
260
+ "flax-sentence-embeddings/stackexchange_xml",
261
+ "ms_marco",
262
+ "gooaq",
263
+ "yahoo_answers_topics",
264
+ "code_search_net",
265
+ "search_qa",
266
+ "eli5",
267
+ "snli",
268
+ "multi_nli",
269
+ "wikihow",
270
+ "natural_questions",
271
+ "trivia_qa",
272
+ "embedding-data/sentence-compression",
273
+ "embedding-data/flickr30k-captions",
274
+ "embedding-data/altlex",
275
+ "embedding-data/simple-wiki",
276
+ "embedding-data/QQP",
277
+ "embedding-data/SPECTER",
278
+ "embedding-data/PAQ_pairs",
279
+ "embedding-data/WikiAnswers"
280
+ ],
281
+ "pipeline_tag": "sentence-similarity"
282
+ },
283
+ "siblings": [
284
+ {
285
+ "rfilename": ".gitattributes"
286
+ },
287
+ {
288
+ "rfilename": "1_Pooling/config.json"
289
+ },
290
+ {
291
+ "rfilename": "README.md"
292
+ },
293
+ {
294
+ "rfilename": "config.json"
295
+ },
296
+ {
297
+ "rfilename": "config_sentence_transformers.json"
298
+ },
299
+ {
300
+ "rfilename": "data_config.json"
301
+ },
302
+ {
303
+ "rfilename": "model.safetensors"
304
+ },
305
+ {
306
+ "rfilename": "modules.json"
307
+ },
308
+ {
309
+ "rfilename": "onnx/model.onnx"
310
+ },
311
+ {
312
+ "rfilename": "onnx/model_O1.onnx"
313
+ },
314
+ {
315
+ "rfilename": "onnx/model_O2.onnx"
316
+ },
317
+ {
318
+ "rfilename": "onnx/model_O3.onnx"
319
+ },
320
+ {
321
+ "rfilename": "onnx/model_O4.onnx"
322
+ },
323
+ {
324
+ "rfilename": "onnx/model_qint8_arm64.onnx"
325
+ },
326
+ {
327
+ "rfilename": "onnx/model_qint8_avx512.onnx"
328
+ },
329
+ {
330
+ "rfilename": "onnx/model_qint8_avx512_vnni.onnx"
331
+ },
332
+ {
333
+ "rfilename": "onnx/model_quint8_avx2.onnx"
334
+ },
335
+ {
336
+ "rfilename": "openvino/openvino_model.bin"
337
+ },
338
+ {
339
+ "rfilename": "openvino/openvino_model.xml"
340
+ },
341
+ {
342
+ "rfilename": "openvino/openvino_model_qint8_quantized.bin"
343
+ },
344
+ {
345
+ "rfilename": "openvino/openvino_model_qint8_quantized.xml"
346
+ },
347
+ {
348
+ "rfilename": "pytorch_model.bin"
349
+ },
350
+ {
351
+ "rfilename": "rust_model.ot"
352
+ },
353
+ {
354
+ "rfilename": "sentence_bert_config.json"
355
+ },
356
+ {
357
+ "rfilename": "special_tokens_map.json"
358
+ },
359
+ {
360
+ "rfilename": "tf_model.h5"
361
+ },
362
+ {
363
+ "rfilename": "tokenizer.json"
364
+ },
365
+ {
366
+ "rfilename": "tokenizer_config.json"
367
+ },
368
+ {
369
+ "rfilename": "train_script.py"
370
+ },
371
+ {
372
+ "rfilename": "vocab.txt"
373
+ }
374
+ ]
375
+ },
376
+ "status": 200,
377
+ "error": ""
378
+ }
379
+ --- STDERR ---
380
+
381
+ [OK] test_hf_model_info.py
382
+ --------------------------------------------------------------------------------
383
+
384
+ === Running test_hf_datasets_search.py ===
385
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_datasets_search.py
386
+ --- INPUT (snippet) ---
387
+ tool.forward(
388
+ query="sentiment analysis",
389
+ tags="language:es",
390
+ sort="downloads",
391
+ direction="descending",
392
+ limit=5,
393
+ )
394
+ try:
395
+ data = json.loads(result_json_str)
396
+ print(json.dumps(data, indent=2, ensure_ascii=False))
397
+ except Exception:
398
+ print(result_json_str)
399
+
400
+
401
+ if __name__ == "__main__":
402
+ main()
403
+
404
+
405
+ --- STDOUT ---
406
+ {
407
+ "results": [
408
+ {
409
+ "type": "dataset",
410
+ "id": "Renukswamy/Patent_sentiment_analysis",
411
+ "owner": "Renukswamy",
412
+ "url": "https://huggingface.co/Renukswamy/Patent_sentiment_analysis",
413
+ "description": "",
414
+ "tags": [
415
+ "size_categories:n<1K",
416
+ "format:csv",
417
+ "modality:tabular",
418
+ "modality:text",
419
+ "library:datasets",
420
+ "library:pandas",
421
+ "library:mlcroissant",
422
+ "library:polars",
423
+ "region:us"
424
+ ],
425
+ "likes": 2,
426
+ "downloads": 12,
427
+ "updatedAt": "2021-11-26T09:18:15.000Z",
428
+ "visibility": "public",
429
+ "access": "accessible"
430
+ },
431
+ {
432
+ "type": "dataset",
433
+ "id": "maydogan/Turkish_SentimentAnalysis_TRSAv1",
434
+ "owner": "maydogan",
435
+ "url": "https://huggingface.co/maydogan/Turkish_SentimentAnalysis_TRSAv1",
436
+ "description": "",
437
+ "tags": [
438
+ "task_categories:text-classification",
439
+ "language:tr",
440
+ "size_categories:100K<n<1M",
441
+ "format:csv",
442
+ "modality:text",
443
+ "library:datasets",
444
+ "library:pandas",
445
+ "library:mlcroissant",
446
+ "library:polars",
447
+ "region:us"
448
+ ],
449
+ "likes": 6,
450
+ "downloads": 161,
451
+ "updatedAt": "2024-10-07T14:16:56.000Z",
452
+ "visibility": "public",
453
+ "access": "accessible"
454
+ },
455
+ {
456
+ "type": "dataset",
457
+ "id": "winvoker/turkish-sentiment-analysis-dataset",
458
+ "owner": "winvoker",
459
+ "url": "https://huggingface.co/winvoker/turkish-sentiment-analysis-dataset",
460
+ "description": "",
461
+ "tags": [
462
+ "task_categories:text-classification",
463
+ "task_ids:sentiment-classification",
464
+ "annotations_creators:crowdsourced",
465
+ "annotations_creators:expert-generated",
466
+ "language_creators:crowdsourced",
467
+ "multilinguality:monolingual",
468
+ "language:tr",
469
+ "license:cc-by-sa-4.0",
470
+ "size_categories:100K<n<1M",
471
+ "format:csv",
472
+ "modality:text",
473
+ "library:datasets",
474
+ "library:pandas",
475
+ "library:mlcroissant",
476
+ "library:polars",
477
+ "region:us"
478
+ ],
479
+ "likes": 47,
480
+ "downloads": 459,
481
+ "updatedAt": "2023-07-19T13:15:13.000Z",
482
+ "visibility": "public",
483
+ "access": "accessible"
484
+ },
485
+ {
486
+ "type": "dataset",
487
+ "id": "ramnika003/autotrain-data-sentiment_analysis_project",
488
+ "owner": "ramnika003",
489
+ "url": "https://huggingface.co/ramnika003/autotrain-data-sentiment_analysis_project",
490
+ "description": "",
491
+ "tags": [
492
+ "task_categories:text-classification",
493
+ "region:us"
494
+ ],
495
+ "likes": 0,
496
+ "downloads": 160,
497
+ "updatedAt": "2022-04-05T09:16:59.000Z",
498
+ "visibility": "public",
499
+ "access": "accessible"
500
+ },
501
+ {
502
+ "type": "dataset",
503
+ "id": "elmurod1202/uzbek-sentiment-analysis",
504
+ "owner": "elmurod1202",
505
+ "url": "https://huggingface.co/elmurod1202/uzbek-sentiment-analysis",
506
+ "description": "",
507
+ "tags": [
508
+ "size_categories:10K<n<100K",
509
+ "format:text",
510
+ "modality:image",
511
+ "modality:text",
512
+ "library:datasets",
513
+ "library:mlcroissant",
514
+ "region:us"
515
+ ],
516
+ "likes": 3,
517
+ "downloads": 236,
518
+ "updatedAt": "2022-05-11T13:43:59.000Z",
519
+ "visibility": "public",
520
+ "access": "accessible"
521
+ }
522
+ ],
523
+ "status": 200,
524
+ "error": "",
525
+ "params": {
526
+ "search": "sentiment analysis",
527
+ "tags": "language:es",
528
+ "sort": "downloads",
529
+ "direction": "descending",
530
+ "limit": 5
531
+ }
532
+ }
533
+ --- STDERR ---
534
+
535
+ [OK] test_hf_datasets_search.py
536
+ --------------------------------------------------------------------------------
537
+
538
+ === Running test_hf_dataset_info.py ===
539
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_dataset_info.py
540
+ --- INPUT (snippet) ---
541
+ tool.forward(repo_id=repo_id)
542
+ try:
543
+ data = json.loads(result_json_str)
544
+ print(json.dumps(data, indent=2, ensure_ascii=False))
545
+ except Exception:
546
+ print(result_json_str)
547
+
548
+
549
+ if __name__ == "__main__":
550
+ main()
551
+
552
+
553
+ --- STDOUT ---
554
+ {
555
+ "item": {
556
+ "type": "dataset",
557
+ "id": "nyu-mll/glue",
558
+ "owner": "nyu-mll",
559
+ "url": "https://huggingface.co/nyu-mll/glue",
560
+ "description": "",
561
+ "tags": [
562
+ "task_categories:text-classification",
563
+ "task_ids:acceptability-classification",
564
+ "task_ids:natural-language-inference",
565
+ "task_ids:semantic-similarity-scoring",
566
+ "task_ids:sentiment-classification",
567
+ "task_ids:text-scoring",
568
+ "annotations_creators:other",
569
+ "language_creators:other",
570
+ "multilinguality:monolingual",
571
+ "source_datasets:original",
572
+ "language:en",
573
+ "license:other",
574
+ "size_categories:1M<n<10M",
575
+ "format:parquet",
576
+ "modality:tabular",
577
+ "modality:text",
578
+ "library:datasets",
579
+ "library:pandas",
580
+ "library:mlcroissant",
581
+ "library:polars",
582
+ "arxiv:1804.07461",
583
+ "region:us",
584
+ "qa-nli",
585
+ "coreference-nli",
586
+ "paraphrase-identification"
587
+ ],
588
+ "likes": 435,
589
+ "downloads": 313025,
590
+ "updatedAt": "2024-01-30T07:41:18.000Z",
591
+ "visibility": "public",
592
+ "access": "accessible",
593
+ "cardData": {
594
+ "annotations_creators": [
595
+ "other"
596
+ ],
597
+ "language_creators": [
598
+ "other"
599
+ ],
600
+ "language": [
601
+ "en"
602
+ ],
603
+ "license": [
604
+ "other"
605
+ ],
606
+ "multilinguality": [
607
+ "monolingual"
608
+ ],
609
+ "size_categories": [
610
+ "10K<n<100K"
611
+ ],
612
+ "source_datasets": [
613
+ "original"
614
+ ],
615
+ "task_categories": [
616
+ "text-classification"
617
+ ],
618
+ "task_ids": [
619
+ "acceptability-classification",
620
+ "natural-language-inference",
621
+ "semantic-similarity-scoring",
622
+ "sentiment-classification",
623
+ "text-scoring"
624
+ ],
625
+ "paperswithcode_id": "glue",
626
+ "pretty_name": "GLUE (General Language Understanding Evaluation benchmark)",
627
+ "config_names": [
628
+ "ax",
629
+ "cola",
630
+ "mnli",
631
+ "mnli_matched",
632
+ "mnli_mismatched",
633
+ "mrpc",
634
+ "qnli",
635
+ "qqp",
636
+ "rte",
637
+ "sst2",
638
+ "stsb",
639
+ "wnli"
640
+ ],
641
+ "tags": [
642
+ "qa-nli",
643
+ "coreference-nli",
644
+ "paraphrase-identification"
645
+ ],
646
+ "dataset_info": [
647
+ {
648
+ "config_name": "ax",
649
+ "features": [
650
+ {
651
+ "name": "premise",
652
+ "dtype": "string"
653
+ },
654
+ {
655
+ "name": "hypothesis",
656
+ "dtype": "string"
657
+ },
658
+ {
659
+ "name": "label",
660
+ "dtype": {
661
+ "class_label": {
662
+ "names": {
663
+ "0": "entailment",
664
+ "1": "neutral",
665
+ "2": "contradiction"
666
+ }
667
+ }
668
+ }
669
+ },
670
+ {
671
+ "name": "idx",
672
+ "dtype": "int32"
673
+ }
674
+ ],
675
+ "splits": [
676
+ {
677
+ "name": "test",
678
+ "num_bytes": 237694,
679
+ "num_examples": 1104
680
+ }
681
+ ],
682
+ "download_size": 80767,
683
+ "dataset_size": 237694
684
+ },
685
+ {
686
+ "config_name": "cola",
687
+ "features": [
688
+ {
689
+ "name": "sentence",
690
+ "dtype": "string"
691
+ },
692
+ {
693
+ "name": "label",
694
+ "dtype": {
695
+ "class_label": {
696
+ "names": {
697
+ "0": "unacceptable",
698
+ "1": "acceptable"
699
+ }
700
+ }
701
+ }
702
+ },
703
+ {
704
+ "name": "idx",
705
+ "dtype": "int32"
706
+ }
707
+ ],
708
+ "splits": [
709
+ {
710
+ "name": "train",
711
+ "num_bytes": 484869,
712
+ "num_examples": 8551
713
+ },
714
+ {
715
+ "name": "validation",
716
+ "num_bytes": 60322,
717
+ "num_examples": 1043
718
+ },
719
+ {
720
+ "name": "test",
721
+ "num_bytes": 60513,
722
+ "num_examples": 1063
723
+ }
724
+ ],
725
+ "download_size": 326394,
726
+ "dataset_size": 605704
727
+ },
728
+ {
729
+ "config_name": "mnli",
730
+ "features": [
731
+ {
732
+ "name": "premise",
733
+ "dtype": "string"
734
+ },
735
+ {
736
+ "name": "hypothesis",
737
+ "dtype": "string"
738
+ },
739
+ {
740
+ "name": "label",
741
+ "dtype": {
742
+ "class_label": {
743
+ "names": {
744
+ "0": "entailment",
745
+ "1": "neutral",
746
+ "2": "contradiction"
747
+ }
748
+ }
749
+ }
750
+ },
751
+ {
752
+ "name": "idx",
753
+ "dtype": "int32"
754
+ }
755
+ ],
756
+ "splits": [
757
+ {
758
+ "name": "train",
759
+ "num_bytes": 74619646,
760
+ "num_examples": 392702
761
+ },
762
+ {
763
+ "name": "validation_matched",
764
+ "num_bytes": 1833783,
765
+ "num_examples": 9815
766
+ },
767
+ {
768
+ "name": "validation_mismatched",
769
+ "num_bytes": 1949231,
770
+ "num_examples": 9832
771
+ },
772
+ {
773
+ "name": "test_matched",
774
+ "num_bytes": 1848654,
775
+ "num_examples": 9796
776
+ },
777
+ {
778
+ "name": "test_mismatched",
779
+ "num_bytes": 1950703,
780
+ "num_examples": 9847
781
+ }
782
+ ],
783
+ "download_size": 57168425,
784
+ "dataset_size": 82202017
785
+ },
786
+ {
787
+ "config_name": "mnli_matched",
788
+ "features": [
789
+ {
790
+ "name": "premise",
791
+ "dtype": "string"
792
+ },
793
+ {
794
+ "name": "hypothesis",
795
+ "dtype": "string"
796
+ },
797
+ {
798
+ "name": "label",
799
+ "dtype": {
800
+ "class_label": {
801
+ "names": {
802
+ "0": "entailment",
803
+ "1": "neutral",
804
+ "2": "contradiction"
805
+ }
806
+ }
807
+ }
808
+ },
809
+ {
810
+ "name": "idx",
811
+ "dtype": "int32"
812
+ }
813
+ ],
814
+ "splits": [
815
+ {
816
+ "name": "validation",
817
+ "num_bytes": 1833783,
818
+ "num_examples": 9815
819
+ },
820
+ {
821
+ "name": "test",
822
+ "num_bytes": 1848654,
823
+ "num_examples": 9796
824
+ }
825
+ ],
826
+ "download_size": 2435055,
827
+ "dataset_size": 3682437
828
+ },
829
+ {
830
+ "config_name": "mnli_mismatched",
831
+ "features": [
832
+ {
833
+ "name": "premise",
834
+ "dtype": "string"
835
+ },
836
+ {
837
+ "name": "hypothesis",
838
+ "dtype": "string"
839
+ },
840
+ {
841
+ "name": "label",
842
+ "dtype": {
843
+ "class_label": {
844
+ "names": {
845
+ "0": "entailment",
846
+ "1": "neutral",
847
+ "2": "contradiction"
848
+ }
849
+ }
850
+ }
851
+ },
852
+ {
853
+ "name": "idx",
854
+ "dtype": "int32"
855
+ }
856
+ ],
857
+ "splits": [
858
+ {
859
+ "name": "validation",
860
+ "num_bytes": 1949231,
861
+ "num_examples": 9832
862
+ },
863
+ {
864
+ "name": "test",
865
+ "num_bytes": 1950703,
866
+ "num_examples": 9847
867
+ }
868
+ ],
869
+ "download_size": 2509009,
870
+ "dataset_size": 3899934
871
+ },
872
+ {
873
+ "config_name": "mrpc",
874
+ "features": [
875
+ {
876
+ "name": "sentence1",
877
+ "dtype": "string"
878
+ },
879
+ {
880
+ "name": "sentence2",
881
+ "dtype": "string"
882
+ },
883
+ {
884
+ "name": "label",
885
+ "dtype": {
886
+ "class_label": {
887
+ "names": {
888
+ "0": "not_equivalent",
889
+ "1": "equivalent"
890
+ }
891
+ }
892
+ }
893
+ },
894
+ {
895
+ "name": "idx",
896
+ "dtype": "int32"
897
+ }
898
+ ],
899
+ "splits": [
900
+ {
901
+ "name": "train",
902
+ "num_bytes": 943843,
903
+ "num_examples": 3668
904
+ },
905
+ {
906
+ "name": "validation",
907
+ "num_bytes": 105879,
908
+ "num_examples": 408
909
+ },
910
+ {
911
+ "name": "test",
912
+ "num_bytes": 442410,
913
+ "num_examples": 1725
914
+ }
915
+ ],
916
+ "download_size": 1033400,
917
+ "dataset_size": 1492132
918
+ },
919
+ {
920
+ "config_name": "qnli",
921
+ "features": [
922
+ {
923
+ "name": "question",
924
+ "dtype": "string"
925
+ },
926
+ {
927
+ "name": "sentence",
928
+ "dtype": "string"
929
+ },
930
+ {
931
+ "name": "label",
932
+ "dtype": {
933
+ "class_label": {
934
+ "names": {
935
+ "0": "entailment",
936
+ "1": "not_entailment"
937
+ }
938
+ }
939
+ }
940
+ },
941
+ {
942
+ "name": "idx",
943
+ "dtype": "int32"
944
+ }
945
+ ],
946
+ "splits": [
947
+ {
948
+ "name": "train",
949
+ "num_bytes": 25612443,
950
+ "num_examples": 104743
951
+ },
952
+ {
953
+ "name": "validation",
954
+ "num_bytes": 1368304,
955
+ "num_examples": 5463
956
+ },
957
+ {
958
+ "name": "test",
959
+ "num_bytes": 1373093,
960
+ "num_examples": 5463
961
+ }
962
+ ],
963
+ "download_size": 19278324,
964
+ "dataset_size": 28353840
965
+ },
966
+ {
967
+ "config_name": "qqp",
968
+ "features": [
969
+ {
970
+ "name": "question1",
971
+ "dtype": "string"
972
+ },
973
+ {
974
+ "name": "question2",
975
+ "dtype": "string"
976
+ },
977
+ {
978
+ "name": "label",
979
+ "dtype": {
980
+ "class_label": {
981
+ "names": {
982
+ "0": "not_duplicate",
983
+ "1": "duplicate"
984
+ }
985
+ }
986
+ }
987
+ },
988
+ {
989
+ "name": "idx",
990
+ "dtype": "int32"
991
+ }
992
+ ],
993
+ "splits": [
994
+ {
995
+ "name": "train",
996
+ "num_bytes": 50900820,
997
+ "num_examples": 363846
998
+ },
999
+ {
1000
+ "name": "validation",
1001
+ "num_bytes": 5653754,
1002
+ "num_examples": 40430
1003
+ },
1004
+ {
1005
+ "name": "test",
1006
+ "num_bytes": 55171111,
1007
+ "num_examples": 390965
1008
+ }
1009
+ ],
1010
+ "download_size": 73982265,
1011
+ "dataset_size": 111725685
1012
+ },
1013
+ {
1014
+ "config_name": "rte",
1015
+ "features": [
1016
+ {
1017
+ "name": "sentence1",
1018
+ "dtype": "string"
1019
+ },
1020
+ {
1021
+ "name": "sentence2",
1022
+ "dtype": "string"
1023
+ },
1024
+ {
1025
+ "name": "label",
1026
+ "dtype": {
1027
+ "class_label": {
1028
+ "names": {
1029
+ "0": "entailment",
1030
+ "1": "not_entailment"
1031
+ }
1032
+ }
1033
+ }
1034
+ },
1035
+ {
1036
+ "name": "idx",
1037
+ "dtype": "int32"
1038
+ }
1039
+ ],
1040
+ "splits": [
1041
+ {
1042
+ "name": "train",
1043
+ "num_bytes": 847320,
1044
+ "num_examples": 2490
1045
+ },
1046
+ {
1047
+ "name": "validation",
1048
+ "num_bytes": 90728,
1049
+ "num_examples": 277
1050
+ },
1051
+ {
1052
+ "name": "test",
1053
+ "num_bytes": 974053,
1054
+ "num_examples": 3000
1055
+ }
1056
+ ],
1057
+ "download_size": 1274409,
1058
+ "dataset_size": 1912101
1059
+ },
1060
+ {
1061
+ "config_name": "sst2",
1062
+ "features": [
1063
+ {
1064
+ "name": "sentence",
1065
+ "dtype": "string"
1066
+ },
1067
+ {
1068
+ "name": "label",
1069
+ "dtype": {
1070
+ "class_label": {
1071
+ "names": {
1072
+ "0": "negative",
1073
+ "1": "positive"
1074
+ }
1075
+ }
1076
+ }
1077
+ },
1078
+ {
1079
+ "name": "idx",
1080
+ "dtype": "int32"
1081
+ }
1082
+ ],
1083
+ "splits": [
1084
+ {
1085
+ "name": "train",
1086
+ "num_bytes": 4681603,
1087
+ "num_examples": 67349
1088
+ },
1089
+ {
1090
+ "name": "validation",
1091
+ "num_bytes": 106252,
1092
+ "num_examples": 872
1093
+ },
1094
+ {
1095
+ "name": "test",
1096
+ "num_bytes": 216640,
1097
+ "num_examples": 1821
1098
+ }
1099
+ ],
1100
+ "download_size": 3331080,
1101
+ "dataset_size": 5004495
1102
+ },
1103
+ {
1104
+ "config_name": "stsb",
1105
+ "features": [
1106
+ {
1107
+ "name": "sentence1",
1108
+ "dtype": "string"
1109
+ },
1110
+ {
1111
+ "name": "sentence2",
1112
+ "dtype": "string"
1113
+ },
1114
+ {
1115
+ "name": "label",
1116
+ "dtype": "float32"
1117
+ },
1118
+ {
1119
+ "name": "idx",
1120
+ "dtype": "int32"
1121
+ }
1122
+ ],
1123
+ "splits": [
1124
+ {
1125
+ "name": "train",
1126
+ "num_bytes": 754791,
1127
+ "num_examples": 5749
1128
+ },
1129
+ {
1130
+ "name": "validation",
1131
+ "num_bytes": 216064,
1132
+ "num_examples": 1500
1133
+ },
1134
+ {
1135
+ "name": "test",
1136
+ "num_bytes": 169974,
1137
+ "num_examples": 1379
1138
+ }
1139
+ ],
1140
+ "download_size": 766983,
1141
+ "dataset_size": 1140829
1142
+ },
1143
+ {
1144
+ "config_name": "wnli",
1145
+ "features": [
1146
+ {
1147
+ "name": "sentence1",
1148
+ "dtype": "string"
1149
+ },
1150
+ {
1151
+ "name": "sentence2",
1152
+ "dtype": "string"
1153
+ },
1154
+ {
1155
+ "name": "label",
1156
+ "dtype": {
1157
+ "class_label": {
1158
+ "names": {
1159
+ "0": "not_entailment",
1160
+ "1": "entailment"
1161
+ }
1162
+ }
1163
+ }
1164
+ },
1165
+ {
1166
+ "name": "idx",
1167
+ "dtype": "int32"
1168
+ }
1169
+ ],
1170
+ "splits": [
1171
+ {
1172
+ "name": "train",
1173
+ "num_bytes": 107109,
1174
+ "num_examples": 635
1175
+ },
1176
+ {
1177
+ "name": "validation",
1178
+ "num_bytes": 12162,
1179
+ "num_examples": 71
1180
+ },
1181
+ {
1182
+ "name": "test",
1183
+ "num_bytes": 37889,
1184
+ "num_examples": 146
1185
+ }
1186
+ ],
1187
+ "download_size": 63522,
1188
+ "dataset_size": 157160
1189
+ }
1190
+ ],
1191
+ "configs": [
1192
+ {
1193
+ "config_name": "ax",
1194
+ "data_files": [
1195
+ {
1196
+ "split": "test",
1197
+ "path": "ax/test-*"
1198
+ }
1199
+ ]
1200
+ },
1201
+ {
1202
+ "config_name": "cola",
1203
+ "data_files": [
1204
+ {
1205
+ "split": "train",
1206
+ "path": "cola/train-*"
1207
+ },
1208
+ {
1209
+ "split": "validation",
1210
+ "path": "cola/validation-*"
1211
+ },
1212
+ {
1213
+ "split": "test",
1214
+ "path": "cola/test-*"
1215
+ }
1216
+ ]
1217
+ },
1218
+ {
1219
+ "config_name": "mnli",
1220
+ "data_files": [
1221
+ {
1222
+ "split": "train",
1223
+ "path": "mnli/train-*"
1224
+ },
1225
+ {
1226
+ "split": "validation_matched",
1227
+ "path": "mnli/validation_matched-*"
1228
+ },
1229
+ {
1230
+ "split": "validation_mismatched",
1231
+ "path": "mnli/validation_mismatched-*"
1232
+ },
1233
+ {
1234
+ "split": "test_matched",
1235
+ "path": "mnli/test_matched-*"
1236
+ },
1237
+ {
1238
+ "split": "test_mismatched",
1239
+ "path": "mnli/test_mismatched-*"
1240
+ }
1241
+ ]
1242
+ },
1243
+ {
1244
+ "config_name": "mnli_matched",
1245
+ "data_files": [
1246
+ {
1247
+ "split": "validation",
1248
+ "path": "mnli_matched/validation-*"
1249
+ },
1250
+ {
1251
+ "split": "test",
1252
+ "path": "mnli_matched/test-*"
1253
+ }
1254
+ ]
1255
+ },
1256
+ {
1257
+ "config_name": "mnli_mismatched",
1258
+ "data_files": [
1259
+ {
1260
+ "split": "validation",
1261
+ "path": "mnli_mismatched/validation-*"
1262
+ },
1263
+ {
1264
+ "split": "test",
1265
+ "path": "mnli_mismatched/test-*"
1266
+ }
1267
+ ]
1268
+ },
1269
+ {
1270
+ "config_name": "mrpc",
1271
+ "data_files": [
1272
+ {
1273
+ "split": "train",
1274
+ "path": "mrpc/train-*"
1275
+ },
1276
+ {
1277
+ "split": "validation",
1278
+ "path": "mrpc/validation-*"
1279
+ },
1280
+ {
1281
+ "split": "test",
1282
+ "path": "mrpc/test-*"
1283
+ }
1284
+ ]
1285
+ },
1286
+ {
1287
+ "config_name": "qnli",
1288
+ "data_files": [
1289
+ {
1290
+ "split": "train",
1291
+ "path": "qnli/train-*"
1292
+ },
1293
+ {
1294
+ "split": "validation",
1295
+ "path": "qnli/validation-*"
1296
+ },
1297
+ {
1298
+ "split": "test",
1299
+ "path": "qnli/test-*"
1300
+ }
1301
+ ]
1302
+ },
1303
+ {
1304
+ "config_name": "qqp",
1305
+ "data_files": [
1306
+ {
1307
+ "split": "train",
1308
+ "path": "qqp/train-*"
1309
+ },
1310
+ {
1311
+ "split": "validation",
1312
+ "path": "qqp/validation-*"
1313
+ },
1314
+ {
1315
+ "split": "test",
1316
+ "path": "qqp/test-*"
1317
+ }
1318
+ ]
1319
+ },
1320
+ {
1321
+ "config_name": "rte",
1322
+ "data_files": [
1323
+ {
1324
+ "split": "train",
1325
+ "path": "rte/train-*"
1326
+ },
1327
+ {
1328
+ "split": "validation",
1329
+ "path": "rte/validation-*"
1330
+ },
1331
+ {
1332
+ "split": "test",
1333
+ "path": "rte/test-*"
1334
+ }
1335
+ ]
1336
+ },
1337
+ {
1338
+ "config_name": "sst2",
1339
+ "data_files": [
1340
+ {
1341
+ "split": "train",
1342
+ "path": "sst2/train-*"
1343
+ },
1344
+ {
1345
+ "split": "validation",
1346
+ "path": "sst2/validation-*"
1347
+ },
1348
+ {
1349
+ "split": "test",
1350
+ "path": "sst2/test-*"
1351
+ }
1352
+ ]
1353
+ },
1354
+ {
1355
+ "config_name": "stsb",
1356
+ "data_files": [
1357
+ {
1358
+ "split": "train",
1359
+ "path": "stsb/train-*"
1360
+ },
1361
+ {
1362
+ "split": "validation",
1363
+ "path": "stsb/validation-*"
1364
+ },
1365
+ {
1366
+ "split": "test",
1367
+ "path": "stsb/test-*"
1368
+ }
1369
+ ]
1370
+ },
1371
+ {
1372
+ "config_name": "wnli",
1373
+ "data_files": [
1374
+ {
1375
+ "split": "train",
1376
+ "path": "wnli/train-*"
1377
+ },
1378
+ {
1379
+ "split": "validation",
1380
+ "path": "wnli/validation-*"
1381
+ },
1382
+ {
1383
+ "split": "test",
1384
+ "path": "wnli/test-*"
1385
+ }
1386
+ ]
1387
+ }
1388
+ ],
1389
+ "train-eval-index": [
1390
+ {
1391
+ "config": "cola",
1392
+ "task": "text-classification",
1393
+ "task_id": "binary_classification",
1394
+ "splits": {
1395
+ "train_split": "train",
1396
+ "eval_split": "validation"
1397
+ },
1398
+ "col_mapping": {
1399
+ "sentence": "text",
1400
+ "label": "target"
1401
+ }
1402
+ },
1403
+ {
1404
+ "config": "sst2",
1405
+ "task": "text-classification",
1406
+ "task_id": "binary_classification",
1407
+ "splits": {
1408
+ "train_split": "train",
1409
+ "eval_split": "validation"
1410
+ },
1411
+ "col_mapping": {
1412
+ "sentence": "text",
1413
+ "label": "target"
1414
+ }
1415
+ },
1416
+ {
1417
+ "config": "mrpc",
1418
+ "task": "text-classification",
1419
+ "task_id": "natural_language_inference",
1420
+ "splits": {
1421
+ "train_split": "train",
1422
+ "eval_split": "validation"
1423
+ },
1424
+ "col_mapping": {
1425
+ "sentence1": "text1",
1426
+ "sentence2": "text2",
1427
+ "label": "target"
1428
+ }
1429
+ },
1430
+ {
1431
+ "config": "qqp",
1432
+ "task": "text-classification",
1433
+ "task_id": "natural_language_inference",
1434
+ "splits": {
1435
+ "train_split": "train",
1436
+ "eval_split": "validation"
1437
+ },
1438
+ "col_mapping": {
1439
+ "question1": "text1",
1440
+ "question2": "text2",
1441
+ "label": "target"
1442
+ }
1443
+ },
1444
+ {
1445
+ "config": "stsb",
1446
+ "task": "text-classification",
1447
+ "task_id": "natural_language_inference",
1448
+ "splits": {
1449
+ "train_split": "train",
1450
+ "eval_split": "validation"
1451
+ },
1452
+ "col_mapping": {
1453
+ "sentence1": "text1",
1454
+ "sentence2": "text2",
1455
+ "label": "target"
1456
+ }
1457
+ },
1458
+ {
1459
+ "config": "mnli",
1460
+ "task": "text-classification",
1461
+ "task_id": "natural_language_inference",
1462
+ "splits": {
1463
+ "train_split": "train",
1464
+ "eval_split": "validation_matched"
1465
+ },
1466
+ "col_mapping": {
1467
+ "premise": "text1",
1468
+ "hypothesis": "text2",
1469
+ "label": "target"
1470
+ }
1471
+ },
1472
+ {
1473
+ "config": "mnli_mismatched",
1474
+ "task": "text-classification",
1475
+ "task_id": "natural_language_inference",
1476
+ "splits": {
1477
+ "train_split": "train",
1478
+ "eval_split": "validation"
1479
+ },
1480
+ "col_mapping": {
1481
+ "premise": "text1",
1482
+ "hypothesis": "text2",
1483
+ "label": "target"
1484
+ }
1485
+ },
1486
+ {
1487
+ "config": "mnli_matched",
1488
+ "task": "text-classification",
1489
+ "task_id": "natural_language_inference",
1490
+ "splits": {
1491
+ "train_split": "train",
1492
+ "eval_split": "validation"
1493
+ },
1494
+ "col_mapping": {
1495
+ "premise": "text1",
1496
+ "hypothesis": "text2",
1497
+ "label": "target"
1498
+ }
1499
+ },
1500
+ {
1501
+ "config": "qnli",
1502
+ "task": "text-classification",
1503
+ "task_id": "natural_language_inference",
1504
+ "splits": {
1505
+ "train_split": "train",
1506
+ "eval_split": "validation"
1507
+ },
1508
+ "col_mapping": {
1509
+ "question": "text1",
1510
+ "sentence": "text2",
1511
+ "label": "target"
1512
+ }
1513
+ },
1514
+ {
1515
+ "config": "rte",
1516
+ "task": "text-classification",
1517
+ "task_id": "natural_language_inference",
1518
+ "splits": {
1519
+ "train_split": "train",
1520
+ "eval_split": "validation"
1521
+ },
1522
+ "col_mapping": {
1523
+ "sentence1": "text1",
1524
+ "sentence2": "text2",
1525
+ "label": "target"
1526
+ }
1527
+ },
1528
+ {
1529
+ "config": "wnli",
1530
+ "task": "text-classification",
1531
+ "task_id": "natural_language_inference",
1532
+ "splits": {
1533
+ "train_split": "train",
1534
+ "eval_split": "validation"
1535
+ },
1536
+ "col_mapping": {
1537
+ "sentence1": "text1",
1538
+ "sentence2": "text2",
1539
+ "label": "target"
1540
+ }
1541
+ }
1542
+ ]
1543
+ },
1544
+ "siblings": [
1545
+ {
1546
+ "rfilename": ".gitattributes"
1547
+ },
1548
+ {
1549
+ "rfilename": "README.md"
1550
+ },
1551
+ {
1552
+ "rfilename": "ax/test-00000-of-00001.parquet"
1553
+ },
1554
+ {
1555
+ "rfilename": "cola/test-00000-of-00001.parquet"
1556
+ },
1557
+ {
1558
+ "rfilename": "cola/train-00000-of-00001.parquet"
1559
+ },
1560
+ {
1561
+ "rfilename": "cola/validation-00000-of-00001.parquet"
1562
+ },
1563
+ {
1564
+ "rfilename": "mnli/test_matched-00000-of-00001.parquet"
1565
+ },
1566
+ {
1567
+ "rfilename": "mnli/test_mismatched-00000-of-00001.parquet"
1568
+ },
1569
+ {
1570
+ "rfilename": "mnli/train-00000-of-00001.parquet"
1571
+ },
1572
+ {
1573
+ "rfilename": "mnli/validation_matched-00000-of-00001.parquet"
1574
+ },
1575
+ {
1576
+ "rfilename": "mnli/validation_mismatched-00000-of-00001.parquet"
1577
+ },
1578
+ {
1579
+ "rfilename": "mnli_matched/test-00000-of-00001.parquet"
1580
+ },
1581
+ {
1582
+ "rfilename": "mnli_matched/validation-00000-of-00001.parquet"
1583
+ },
1584
+ {
1585
+ "rfilename": "mnli_mismatched/test-00000-of-00001.parquet"
1586
+ },
1587
+ {
1588
+ "rfilename": "mnli_mismatched/validation-00000-of-00001.parquet"
1589
+ },
1590
+ {
1591
+ "rfilename": "mrpc/test-00000-of-00001.parquet"
1592
+ },
1593
+ {
1594
+ "rfilename": "mrpc/train-00000-of-00001.parquet"
1595
+ },
1596
+ {
1597
+ "rfilename": "mrpc/validation-00000-of-00001.parquet"
1598
+ },
1599
+ {
1600
+ "rfilename": "qnli/test-00000-of-00001.parquet"
1601
+ },
1602
+ {
1603
+ "rfilename": "qnli/train-00000-of-00001.parquet"
1604
+ },
1605
+ {
1606
+ "rfilename": "qnli/validation-00000-of-00001.parquet"
1607
+ },
1608
+ {
1609
+ "rfilename": "qqp/test-00000-of-00001.parquet"
1610
+ },
1611
+ {
1612
+ "rfilename": "qqp/train-00000-of-00001.parquet"
1613
+ },
1614
+ {
1615
+ "rfilename": "qqp/validation-00000-of-00001.parquet"
1616
+ },
1617
+ {
1618
+ "rfilename": "rte/test-00000-of-00001.parquet"
1619
+ },
1620
+ {
1621
+ "rfilename": "rte/train-00000-of-00001.parquet"
1622
+ },
1623
+ {
1624
+ "rfilename": "rte/validation-00000-of-00001.parquet"
1625
+ },
1626
+ {
1627
+ "rfilename": "sst2/test-00000-of-00001.parquet"
1628
+ },
1629
+ {
1630
+ "rfilename": "sst2/train-00000-of-00001.parquet"
1631
+ },
1632
+ {
1633
+ "rfilename": "sst2/validation-00000-of-00001.parquet"
1634
+ },
1635
+ {
1636
+ "rfilename": "stsb/test-00000-of-00001.parquet"
1637
+ },
1638
+ {
1639
+ "rfilename": "stsb/train-00000-of-00001.parquet"
1640
+ },
1641
+ {
1642
+ "rfilename": "stsb/validation-00000-of-00001.parquet"
1643
+ },
1644
+ {
1645
+ "rfilename": "wnli/test-00000-of-00001.parquet"
1646
+ },
1647
+ {
1648
+ "rfilename": "wnli/train-00000-of-00001.parquet"
1649
+ },
1650
+ {
1651
+ "rfilename": "wnli/validation-00000-of-00001.parquet"
1652
+ }
1653
+ ]
1654
+ },
1655
+ "status": 200,
1656
+ "error": ""
1657
+ }
1658
+ --- STDERR ---
1659
+
1660
+ [OK] test_hf_dataset_info.py
1661
+ --------------------------------------------------------------------------------
1662
+
1663
+ === Running test_hf_spaces_search.py ===
1664
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_spaces_search.py
1665
+ --- INPUT (snippet) ---
1666
+ tool.forward(
1667
+ query="document Q&A",
1668
+ sort="likes",
1669
+ direction="descending",
1670
+ limit=5,
1671
+ )
1672
+ try:
1673
+ data = json.loads(result_json_str)
1674
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1675
+ except Exception:
1676
+ print(result_json_str)
1677
+
1678
+
1679
+ if __name__ == "__main__":
1680
+ main()
1681
+
1682
+
1683
+ --- STDOUT ---
1684
+ {
1685
+ "results": [],
1686
+ "status": 200,
1687
+ "error": "",
1688
+ "params": {
1689
+ "search": "document Q&A",
1690
+ "sort": "likes",
1691
+ "direction": "descending",
1692
+ "limit": 5
1693
+ }
1694
+ }
1695
+ --- STDERR ---
1696
+
1697
+ [OK] test_hf_spaces_search.py
1698
+ --------------------------------------------------------------------------------
1699
+
1700
+ === Running test_hf_space_info.py ===
1701
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_space_info.py
1702
+ --- INPUT (snippet) ---
1703
+ tool.forward(repo_id=repo_id)
1704
+ try:
1705
+ data = json.loads(result_json_str)
1706
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1707
+ except Exception:
1708
+ # Fallback: encode to UTF-8 to avoid Windows cp1252 issues
1709
+ try:
1710
+ print(result_json_str.encode("utf-8", errors="replace").decode("utf-8", errors="replace"))
1711
+ except Exception:
1712
+ print("<unprintable> due to encoding error")
1713
+
1714
+
1715
+ if __name__ == "__main__":
1716
+ main()
1717
+
1718
+
1719
+ --- STDOUT ---
1720
+ <unprintable> due to encoding error
1721
+ --- STDERR ---
1722
+
1723
+ [OK] test_hf_space_info.py
1724
+ --------------------------------------------------------------------------------
1725
+
1726
+ === Running test_hf_user_info.py ===
1727
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_user_info.py
1728
+ --- INPUT (snippet) ---
1729
+ tool.forward(username=username)
1730
+ try:
1731
+ data = json.loads(result_json_str)
1732
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1733
+ except Exception:
1734
+ print(result_json_str)
1735
+
1736
+
1737
+ if __name__ == "__main__":
1738
+ main()
1739
+
1740
+
1741
+ --- STDOUT ---
1742
+ {
1743
+ "item": {},
1744
+ "status": 404,
1745
+ "error": "http_404",
1746
+ "visibility": "public",
1747
+ "access": "no_access"
1748
+ }
1749
+ --- STDERR ---
1750
+
1751
+ [OK] test_hf_user_info.py
1752
+ --------------------------------------------------------------------------------
1753
+
1754
+ === Running test_hf_collections_list.py ===
1755
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_collections_list.py
1756
+ --- INPUT (snippet) ---
1757
+ tool.forward(owner=None) # or set an owner like "huggingface"
1758
+ try:
1759
+ data = json.loads(result_json_str)
1760
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1761
+ except Exception:
1762
+ print(result_json_str)
1763
+
1764
+
1765
+ if __name__ == "__main__":
1766
+ main()
1767
+
1768
+
1769
+ --- STDOUT ---
1770
+ {
1771
+ "results": [
1772
+ {
1773
+ "type": "collection",
1774
+ "id": "deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f",
1775
+ "owner": "deepseek-ai",
1776
+ "title": "DeepSeek-V3.1",
1777
+ "url": "https://huggingface.co/collections/deepseek-ai/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f",
1778
+ "visibility": "public",
1779
+ "access": "accessible"
1780
+ },
1781
+ {
1782
+ "type": "collection",
1783
+ "id": "facebook/dinov3-68924841bd6b561778e31009",
1784
+ "owner": "facebook",
1785
+ "title": "DINOv3",
1786
+ "url": "https://huggingface.co/collections/facebook/facebook/dinov3-68924841bd6b561778e31009",
1787
+ "visibility": "public",
1788
+ "access": "accessible"
1789
+ },
1790
+ {
1791
+ "type": "collection",
1792
+ "id": "AIDC-AI/ovis25-689ec1474633b2aab8809335",
1793
+ "owner": "AIDC-AI",
1794
+ "title": "Ovis2.5",
1795
+ "url": "https://huggingface.co/collections/AIDC-AI/AIDC-AI/ovis25-689ec1474633b2aab8809335",
1796
+ "visibility": "public",
1797
+ "access": "accessible"
1798
+ },
1799
+ {
1800
+ "type": "collection",
1801
+ "id": "nvidia/nvidia-nemotron-689f6d6e6ead8e77dd641615",
1802
+ "owner": "nvidia",
1803
+ "title": "NVIDIA Nemotron",
1804
+ "url": "https://huggingface.co/collections/nvidia/nvidia/nvidia-nemotron-689f6d6e6ead8e77dd641615",
1805
+ "visibility": "public",
1806
+ "access": "accessible"
1807
+ },
1808
+ {
1809
+ "type": "collection",
1810
+ "id": "ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd",
1811
+ "owner": "ByteDance-Seed",
1812
+ "title": "Seed-OSS",
1813
+ "url": "https://huggingface.co/collections/ByteDance-Seed/ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd",
1814
+ "visibility": "public",
1815
+ "access": "accessible"
1816
+ },
1817
+ {
1818
+ "type": "collection",
1819
+ "id": "google/gemma-3-release-67c6c6f89c4f76621268bb6d",
1820
+ "owner": "google",
1821
+ "title": "Gemma 3 Release",
1822
+ "url": "https://huggingface.co/collections/google/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
1823
+ "visibility": "public",
1824
+ "access": "accessible"
1825
+ },
1826
+ {
1827
+ "type": "collection",
1828
+ "id": "openai/gpt-oss-68911959590a1634ba11c7a4",
1829
+ "owner": "openai",
1830
+ "title": "gpt-oss",
1831
+ "url": "https://huggingface.co/collections/openai/openai/gpt-oss-68911959590a1634ba11c7a4",
1832
+ "visibility": "public",
1833
+ "access": "accessible"
1834
+ },
1835
+ {
1836
+ "type": "collection",
1837
+ "id": "Qwen/qwen3-67dd247413f0e2e4f653967f",
1838
+ "owner": "Qwen",
1839
+ "title": "Qwen3",
1840
+ "url": "https://huggingface.co/collections/Qwen/Qwen/qwen3-67dd247413f0e2e4f653967f",
1841
+ "visibility": "public",
1842
+ "access": "accessible"
1843
+ },
1844
+ {
1845
+ "type": "collection",
1846
+ "id": "nvidia/nemotron-pre-training-dataset-689d9de36f84279d83786b35",
1847
+ "owner": "nvidia",
1848
+ "title": "Nemotron-Pre-Training-Dataset",
1849
+ "url": "https://huggingface.co/collections/nvidia/nvidia/nemotron-pre-training-dataset-689d9de36f84279d83786b35",
1850
+ "visibility": "public",
1851
+ "access": "accessible"
1852
+ },
1853
+ {
1854
+ "type": "collection",
1855
+ "id": "inclusionAI/ui-venus-689f2fb01a4234cbce91c56a",
1856
+ "owner": "inclusionAI",
1857
+ "title": "UI-Venus",
1858
+ "url": "https://huggingface.co/collections/inclusionAI/inclusionAI/ui-venus-689f2fb01a4234cbce91c56a",
1859
+ "visibility": "public",
1860
+ "access": "accessible"
1861
+ }
1862
+ ],
1863
+ "status": 200,
1864
+ "error": ""
1865
+ }
1866
+ --- STDERR ---
1867
+
1868
+ [OK] test_hf_collections_list.py
1869
+ --------------------------------------------------------------------------------
1870
+
1871
+ === Running test_hf_collection_get.py ===
1872
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_collection_get.py
1873
+ --- INPUT (snippet) ---
1874
+ tool.forward(namespace=namespace, slug_id=slug_id)
1875
+ try:
1876
+ data = json.loads(result_json_str)
1877
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1878
+ except Exception:
1879
+ print(result_json_str)
1880
+
1881
+
1882
+ if __name__ == "__main__":
1883
+ main()
1884
+
1885
+
1886
+ --- STDOUT ---
1887
+ {
1888
+ "item": {},
1889
+ "status": 404,
1890
+ "error": "http_404"
1891
+ }
1892
+ --- STDERR ---
1893
+
1894
+ [OK] test_hf_collection_get.py
1895
+ --------------------------------------------------------------------------------
1896
+
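Note on the http_404 above: the collection lookup uses the placeholder namespace/slug hard-coded in tests/test_hf_collection_get.py ("huggingface" / "trending-models"), which does not exist on the Hub. A real target can be resolved from the listing tool first. The listing's "id" field already embeds the owner (which is also why the owner segment appears twice in the URLs printed by the previous test), so only the trailing slug should be passed on. A minimal sketch, assuming the envelope shapes shown in this log and a run from the repository root (or the same sys.path tweak the tests use):

import json

from scripts.hf_tools import HFCollectionsListTool, HFCollectionGetTool

# Resolve a live collection from the listing instead of hard-coding namespace/slug.
listing = json.loads(HFCollectionsListTool().forward(owner=None))
if listing.get("status") == 200 and listing.get("results"):
    first = listing["results"][0]
    owner = first["owner"]
    slug_id = first["id"].split("/", 1)[1]  # drop the owner prefix embedded in the id
    detail = json.loads(HFCollectionGetTool().forward(namespace=owner, slug_id=slug_id))
    print(detail.get("status"), detail.get("error") or "ok")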
1897
+ === Running test_hf_paper_info.py ===
1898
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_paper_info.py
1899
+ --- INPUT (snippet) ---
1900
+ tool.forward(arxiv_id=arxiv_id)
1901
+ try:
1902
+ data = json.loads(result_json_str)
1903
+ print(json.dumps(data, indent=2, ensure_ascii=False))
1904
+ except Exception:
1905
+ print(result_json_str)
1906
+
1907
+
1908
+ if __name__ == "__main__":
1909
+ main()
1910
+
1911
+
1912
+ --- STDOUT ---
1913
+ {
1914
+ "item": {
1915
+ "id": "1706.03762",
1916
+ "authors": [
1917
+ {
1918
+ "_id": "6411c77d6b75ddced38902b6",
1919
+ "user": {
1920
+ "_id": "60fe1b231a3e6f5129776bf9",
1921
+ "avatarUrl": "/avatars/c80edad5267c6ed9ecde9b056993d5c3.svg",
1922
+ "isPro": false,
1923
+ "fullname": "Ashish Vaswani",
1924
+ "user": "ashishvaswanigoogle",
1925
+ "type": "user"
1926
+ },
1927
+ "name": "Ashish Vaswani",
1928
+ "status": "extracted_pending",
1929
+ "statusLastChangedAt": "2023-03-15T09:40:25.803Z",
1930
+ "hidden": false
1931
+ },
1932
+ {
1933
+ "_id": "6411c77d6b75ddced38902b7",
1934
+ "name": "Noam Shazeer",
1935
+ "hidden": false
1936
+ },
1937
+ {
1938
+ "_id": "6411c77d6b75ddced38902b8",
1939
+ "user": {
1940
+ "_id": "60fee8e1465daccb9f332e2f",
1941
+ "avatarUrl": "/avatars/ea92bf15e181b4f13a70f18ee3ba7a51.svg",
1942
+ "isPro": false,
1943
+ "fullname": "Niki Parmar",
1944
+ "user": "nikip",
1945
+ "type": "user"
1946
+ },
1947
+ "name": "Niki Parmar",
1948
+ "status": "extracted_pending",
1949
+ "statusLastChangedAt": "2023-03-15T09:40:25.803Z",
1950
+ "hidden": false
1951
+ },
1952
+ {
1953
+ "_id": "6411c77d6b75ddced38902b9",
1954
+ "name": "Jakob Uszkoreit",
1955
+ "hidden": false
1956
+ },
1957
+ {
1958
+ "_id": "6411c77d6b75ddced38902ba",
1959
+ "name": "Llion Jones",
1960
+ "hidden": false
1961
+ },
1962
+ {
1963
+ "_id": "6411c77d6b75ddced38902bb",
1964
+ "name": "Aidan N. Gomez",
1965
+ "hidden": false
1966
+ },
1967
+ {
1968
+ "_id": "6411c77d6b75ddced38902bc",
1969
+ "name": "Lukasz Kaiser",
1970
+ "hidden": false
1971
+ },
1972
+ {
1973
+ "_id": "6411c77d6b75ddced38902bd",
1974
+ "name": "Illia Polosukhin",
1975
+ "hidden": false
1976
+ }
1977
+ ],
1978
+ "publishedAt": "2017-06-12T17:57:34.000Z",
1979
+ "title": "Attention Is All You Need",
1980
+ "summary": "The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of the best models from the literature. We show that the\nTransformer generalizes well to other tasks by applying it successfully to\nEnglish constituency parsing both with large and limited training data.",
1981
+ "upvotes": 79,
1982
+ "discussionId": "641192343ea54b1aa7e2f084",
1983
+ "ai_summary": "The Transformer architecture, based purely on attention mechanisms, achieves superior performance on machine translation and parsing tasks with improved parallelizability and reduced training time.",
1984
+ "ai_keywords": [
1985
+ "recurrent neural networks",
1986
+ "convolutional neural networks",
1987
+ "encoder-decoder configuration",
1988
+ "attention mechanism",
1989
+ "Transformer",
1990
+ "BLEU score",
1991
+ "WMT 2014 English-to-German translation",
1992
+ "WMT 2014 English-to-French translation",
1993
+ "English constituency parsing"
1994
+ ]
1995
+ },
1996
+ "status": 200,
1997
+ "error": ""
1998
+ }
1999
+ --- STDERR ---
2000
+
2001
+ [OK] test_hf_paper_info.py
2002
+ --------------------------------------------------------------------------------
2003
+
2004
+ === Running test_hf_paper_repos.py ===
2005
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_paper_repos.py
2006
+ --- INPUT (snippet) ---
2007
+ tool.forward(arxiv_id=arxiv_id)
2008
+ try:
2009
+ data = json.loads(result_json_str)
2010
+ print(json.dumps(data, indent=2, ensure_ascii=False))
2011
+ except Exception:
2012
+ print(result_json_str)
2013
+
2014
+
2015
+ if __name__ == "__main__":
2016
+ main()
2017
+
2018
+
2019
+ --- STDOUT ---
2020
+ {
2021
+ "results": [],
2022
+ "status": 200,
2023
+ "error": ""
2024
+ }
2025
+ --- STDERR ---
2026
+
2027
+ [OK] test_hf_paper_repos.py
2028
+ --------------------------------------------------------------------------------
2029
+
2030
+ === Running test_hf_daily_papers.py ===
2031
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_daily_papers.py
2032
+ --- INPUT (snippet) ---
2033
+ tool.forward()
2034
+ try:
2035
+ data = json.loads(result_json_str)
2036
+ print(json.dumps(data, indent=2, ensure_ascii=False))
2037
+ except Exception:
2038
+ try:
2039
+ print(result_json_str.encode("utf-8", errors="replace").decode("utf-8", errors="replace"))
2040
+ except Exception:
2041
+ print("<unprintable> due to encoding error")
2042
+
2043
+
2044
+ if __name__ == "__main__":
2045
+ main()
2046
+
2047
+
2048
+ --- STDOUT ---
2049
+ <unprintable> due to encoding error
2050
+ --- STDERR ---
2051
+
2052
+ [OK] test_hf_daily_papers.py
2053
+ --------------------------------------------------------------------------------
2054
+
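Note on the "<unprintable> due to encoding error" output above: the daily-papers payload contains non-ASCII characters in paper titles, and on Windows a piped stdout typically falls back to the legacy ANSI code page (cp1252 here), so both print attempts in the test raise UnicodeEncodeError and the fallback message is emitted instead of the data. A minimal sketch of a fix inside the test (Python 3.7+ text streams); setting PYTHONUTF8=1 in the environment before running the suite is an equivalent workaround:

import sys

# Force UTF-8 (with replacement) on stdout so non-ASCII titles survive the pipe on Windows.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")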
2055
+ === Running test_hf_repo_info.py ===
2056
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_repo_info.py
2057
+ --- INPUT (snippet) ---
2058
+ tool.forward(repo_type="model", repo_id="bert-base-uncased")
2059
+ try:
2060
+ data = json.loads(result_json_str)
2061
+ print(json.dumps(data, indent=2, ensure_ascii=False))
2062
+ except Exception:
2063
+ print(result_json_str)
2064
+
2065
+
2066
+ if __name__ == "__main__":
2067
+ main()
2068
+
2069
+
2070
+ --- STDOUT ---
2071
+ {
2072
+ "item": {
2073
+ "_id": "621ffdc036468d709f174338",
2074
+ "id": "google-bert/bert-base-uncased",
2075
+ "private": false,
2076
+ "pipeline_tag": "fill-mask",
2077
+ "library_name": "transformers",
2078
+ "tags": [
2079
+ "transformers",
2080
+ "pytorch",
2081
+ "tf",
2082
+ "jax",
2083
+ "rust",
2084
+ "coreml",
2085
+ "onnx",
2086
+ "safetensors",
2087
+ "bert",
2088
+ "fill-mask",
2089
+ "exbert",
2090
+ "en",
2091
+ "dataset:bookcorpus",
2092
+ "dataset:wikipedia",
2093
+ "arxiv:1810.04805",
2094
+ "license:apache-2.0",
2095
+ "autotrain_compatible",
2096
+ "endpoints_compatible",
2097
+ "region:us"
2098
+ ],
2099
+ "downloads": 52064840,
2100
+ "likes": 2385,
2101
+ "modelId": "google-bert/bert-base-uncased",
2102
+ "author": "google-bert",
2103
+ "sha": "86b5e0934494bd15c9632b12f734a8a67f723594",
2104
+ "lastModified": "2024-02-19T11:06:12.000Z",
2105
+ "gated": false,
2106
+ "disabled": false,
2107
+ "mask_token": "[MASK]",
2108
+ "widgetData": [
2109
+ {
2110
+ "text": "Paris is the [MASK] of France."
2111
+ },
2112
+ {
2113
+ "text": "The goal of life is [MASK]."
2114
+ }
2115
+ ],
2116
+ "model-index": null,
2117
+ "config": {
2118
+ "architectures": [
2119
+ "BertForMaskedLM"
2120
+ ],
2121
+ "model_type": "bert",
2122
+ "tokenizer_config": {}
2123
+ },
2124
+ "cardData": {
2125
+ "language": "en",
2126
+ "tags": [
2127
+ "exbert"
2128
+ ],
2129
+ "license": "apache-2.0",
2130
+ "datasets": [
2131
+ "bookcorpus",
2132
+ "wikipedia"
2133
+ ]
2134
+ },
2135
+ "transformersInfo": {
2136
+ "auto_model": "AutoModelForMaskedLM",
2137
+ "pipeline_tag": "fill-mask",
2138
+ "processor": "AutoTokenizer"
2139
+ },
2140
+ "siblings": [
2141
+ {
2142
+ "rfilename": ".gitattributes"
2143
+ },
2144
+ {
2145
+ "rfilename": "LICENSE"
2146
+ },
2147
+ {
2148
+ "rfilename": "README.md"
2149
+ },
2150
+ {
2151
+ "rfilename": "config.json"
2152
+ },
2153
+ {
2154
+ "rfilename": "coreml/fill-mask/float32_model.mlpackage/Data/com.apple.CoreML/model.mlmodel"
2155
+ },
2156
+ {
2157
+ "rfilename": "coreml/fill-mask/float32_model.mlpackage/Data/com.apple.CoreML/weights/weight.bin"
2158
+ },
2159
+ {
2160
+ "rfilename": "coreml/fill-mask/float32_model.mlpackage/Manifest.json"
2161
+ },
2162
+ {
2163
+ "rfilename": "flax_model.msgpack"
2164
+ },
2165
+ {
2166
+ "rfilename": "model.onnx"
2167
+ },
2168
+ {
2169
+ "rfilename": "model.safetensors"
2170
+ },
2171
+ {
2172
+ "rfilename": "pytorch_model.bin"
2173
+ },
2174
+ {
2175
+ "rfilename": "rust_model.ot"
2176
+ },
2177
+ {
2178
+ "rfilename": "tf_model.h5"
2179
+ },
2180
+ {
2181
+ "rfilename": "tokenizer.json"
2182
+ },
2183
+ {
2184
+ "rfilename": "tokenizer_config.json"
2185
+ },
2186
+ {
2187
+ "rfilename": "vocab.txt"
2188
+ }
2189
+ ],
2190
+ "spaces": [
2191
+ "mteb/leaderboard",
2192
+ "microsoft/HuggingGPT",
2193
+ "Vision-CAIR/minigpt4",
2194
+ "lnyan/stablediffusion-infinity",
2195
+ "multimodalart/latentdiffusion",
2196
+ "mrfakename/MeloTTS",
2197
+ "Salesforce/BLIP",
2198
+ "shi-labs/Versatile-Diffusion",
2199
+ "yizhangliu/Grounded-Segment-Anything",
2200
+ "stepfun-ai/Step1X-Edit",
2201
+ "H-Liu1997/TANGO",
2202
+ "xinyu1205/recognize-anything",
2203
+ "cvlab/zero123-live",
2204
+ "hilamanor/audioEditing",
2205
+ "alexnasa/Chain-of-Zoom",
2206
+ "AIGC-Audio/AudioGPT",
2207
+ "Audio-AGI/AudioSep",
2208
+ "m-ric/chunk_visualizer",
2209
+ "jadechoghari/OpenMusic",
2210
+ "DAMO-NLP-SG/Video-LLaMA",
2211
+ "gligen/demo",
2212
+ "declare-lab/mustango",
2213
+ "Yiwen-ntu/MeshAnything",
2214
+ "exbert-project/exbert",
2215
+ "shgao/EditAnything",
2216
+ "LiruiZhao/Diffree",
2217
+ "Vision-CAIR/MiniGPT-v2",
2218
+ "Yuliang/ECON",
2219
+ "nikigoli/countgd",
2220
+ "THUdyh/Oryx",
2221
+ "IDEA-Research/Grounded-SAM",
2222
+ "Awiny/Image2Paragraph",
2223
+ "ShilongLiu/Grounding_DINO_demo",
2224
+ "OpenSound/CapSpeech-TTS",
2225
+ "merve/Grounding_DINO_demo",
2226
+ "yangheng/Super-Resolution-Anime-Diffusion",
2227
+ "liuyuan-pal/SyncDreamer",
2228
+ "XiangJinYu/SPO",
2229
+ "sam-hq-team/sam-hq",
2230
+ "haotiz/glip-zeroshot-demo",
2231
+ "TencentARC/BrushEdit",
2232
+ "Nick088/Audio-SR",
2233
+ "nateraw/lavila",
2234
+ "abyildirim/inst-inpaint",
2235
+ "Yiwen-ntu/MeshAnythingV2",
2236
+ "Pinwheel/GLIP-BLIP-Object-Detection-VQA",
2237
+ "Junfeng5/GLEE_demo",
2238
+ "shi-labs/Matting-Anything",
2239
+ "fffiloni/Video-Matting-Anything",
2240
+ "burtenshaw/autotrain-mcp",
2241
+ "Vision-CAIR/MiniGPT4-video",
2242
+ "linfanluntan/Grounded-SAM",
2243
+ "magicr/BuboGPT",
2244
+ "WensongSong/Insert-Anything",
2245
+ "nvidia/audio-flamingo-2",
2246
+ "multimodalart/MoDA-fast-talking-head",
2247
+ "clip-italian/clip-italian-demo",
2248
+ "OpenGVLab/InternGPT",
2249
+ "mteb/leaderboard_legacy",
2250
+ "hongfz16/3DTopia",
2251
+ "yenniejun/tokenizers-languages",
2252
+ "mmlab-ntu/relate-anything-model",
2253
+ "amphion/PicoAudio",
2254
+ "byeongjun-park/HarmonyView",
2255
+ "keras-io/bert-semantic-similarity",
2256
+ "MirageML/sjc",
2257
+ "fffiloni/vta-ldm",
2258
+ "NAACL2022/CLIP-Caption-Reward",
2259
+ "society-ethics/model-card-regulatory-check",
2260
+ "fffiloni/miniGPT4-Video-Zero",
2261
+ "AIGC-Audio/AudioLCM",
2262
+ "Gladiator/Text-Summarizer",
2263
+ "SVGRender/DiffSketcher",
2264
+ "ethanchern/Anole",
2265
+ "LittleFrog/IntrinsicAnything",
2266
+ "milyiyo/reimagine-it",
2267
+ "ysharma/text-to-image-to-video",
2268
+ "OpenGVLab/VideoChatGPT",
2269
+ "acmc/whatsapp-chats-finetuning-formatter",
2270
+ "ZebangCheng/Emotion-LLaMA",
2271
+ "zakaria-narjis/photo-enhancer",
2272
+ "kaushalya/medclip-roco",
2273
+ "AIGC-Audio/Make_An_Audio",
2274
+ "avid-ml/bias-detection",
2275
+ "sonalkum/GAMA",
2276
+ "topdu/OpenOCR-Demo",
2277
+ "RitaParadaRamos/SmallCapDemo",
2278
+ "llizhx/TinyGPT-V",
2279
+ "codelion/Grounding_DINO_demo",
2280
+ "flosstradamus/FluxMusicGUI",
2281
+ "bartar/tokenizers",
2282
+ "Tinkering/Pytorch-day-prez",
2283
+ "sasha/BiasDetection",
2284
+ "Pusheen/LoCo",
2285
+ "Jingkang/EgoGPT-7B",
2286
+ "flax-community/koclip",
2287
+ "TencentARC/VLog",
2288
+ "ynhe/AskAnything",
2289
+ "Volkopat/SegmentAnythingxGroundingDINO",
2290
+ "phyloforfun/VoucherVision"
2291
+ ],
2292
+ "createdAt": "2022-03-02T23:29:04.000Z",
2293
+ "safetensors": {
2294
+ "parameters": {
2295
+ "F32": 110106428
2296
+ },
2297
+ "total": 110106428
2298
+ },
2299
+ "inference": "warm",
2300
+ "usedStorage": 13397387509
2301
+ },
2302
+ "status": 200,
2303
+ "error": ""
2304
+ }
2305
+ --- STDERR ---
2306
+
2307
+ [OK] test_hf_repo_info.py
2308
+ --------------------------------------------------------------------------------
2309
+
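Note: the request above asked for repo_id="bert-base-uncased", but the envelope comes back with the canonical id "google-bert/bert-base-uncased"; downstream code should read the id out of the response rather than reusing the query string. A minimal helper sketch, assuming the item-style envelope printed above:

import json

def canonical_repo_id(result_json_str: str) -> str:
    """Return the canonical repo id from an item-style envelope like the one above."""
    envelope = json.loads(result_json_str)
    if envelope.get("status") != 200:
        raise RuntimeError(envelope.get("error") or f"http_{envelope.get('status')}")
    return envelope["item"]["id"]  # e.g. "google-bert/bert-base-uncased"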
2310
+ === Running test_hf_site_search.py ===
2311
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_site_search.py
2312
+ --- INPUT (snippet) ---
2313
+ tool.forward(query="fine-tuning tutorial", limit=5)
2314
+ try:
2315
+ data = json.loads(result_json_str)
2316
+ print(json.dumps(data, indent=2, ensure_ascii=False))
2317
+ except Exception:
2318
+ print(result_json_str)
2319
+
2320
+
2321
+ if __name__ == "__main__":
2322
+ main()
2323
+
2324
+
2325
+ --- STDOUT ---
2326
+ {
2327
+ "results": [
2328
+ {
2329
+ "type": "site",
2330
+ "title": "Fine-tuning - Hugging Face",
2331
+ "url": "https://huggingface.co/docs/transformers/training",
2332
+ "snippet": "Fine-tuning adapts a pretrained model to a specific task with a smaller specialized dataset. This approach requires far less data and compute compared to training a model from scratch, which makes it a more accessible option for many users. Transformers provides the Trainer API, which offers a comprehensive set of training features, for fine-tuning any of the models on the Hub.",
2333
+ "date": null
2334
+ },
2335
+ {
2336
+ "type": "site",
2337
+ "title": "Fine-tune a pretrained model - Hugging Face",
2338
+ "url": "https://huggingface.co/docs/transformers/v4.18.0/en/training",
2339
+ "snippet": "Before you can fine-tune a pretrained model, download a dataset and prepare it for training. The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test!",
2340
+ "date": null
2341
+ },
2342
+ {
2343
+ "type": "site",
2344
+ "title": "Let's Fine-Tune Your Model for Function-Calling - Hugging Face",
2345
+ "url": "https://huggingface.co/learn/agents-course/bonus-unit1/fine-tuning",
2346
+ "snippet": "In this tutorial, we will build a function-calling model based on google/gemma-2-2b-it. We choose the fine-tuned model google/gemma-2-2b-it instead of the base model google/gemma-2-2b because the fine-tuned model has been improved for our use-case.",
2347
+ "date": null
2348
+ },
2349
+ {
2350
+ "type": "site",
2351
+ "title": "Fine-Tuning a Vision Language Model (Qwen2-VL-7B) with the Hugging Face ...",
2352
+ "url": "https://huggingface.co/learn/cookbook/fine_tuning_vlm_trl",
2353
+ "snippet": "Phil Schmid's tutorial: an excellent deep dive into fine-tuning multimodal LLMs with TRL. Merve Noyan's smol-vision repository: a collection of engaging notebooks on cutting-edge vision and multimodal AI topics.",
2354
+ "date": null
2355
+ },
2356
+ {
2357
+ "type": "site",
2358
+ "title": "Fine-tuning a pretrained model - Hugging Face",
2359
+ "url": "https://huggingface.co/docs/transformers/v4.15.0/training",
2360
+ "snippet": "In this tutorial, we will show you how to fine-tune a pretrained model from the Transformers library. In TensorFlow, models can be directly trained using Keras and the fit method.",
2361
+ "date": null
2362
+ }
2363
+ ],
2364
+ "status": 200,
2365
+ "error": ""
2366
+ }
2367
+ --- STDERR ---
2368
+
2369
+ [OK] test_hf_site_search.py
2370
+ --------------------------------------------------------------------------------
2371
+
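The site-search tool appears to return the same list-style envelope as the other search tools ("results" / "status" / "error"), with each hit carrying "title", "url", and "snippet". A minimal consumer sketch, assuming that shape:

import json

def site_hits(result_json_str: str) -> list[tuple[str, str]]:
    """Turn a list-style envelope into (title, url) pairs, skipping failed calls."""
    envelope = json.loads(result_json_str)
    if envelope.get("status") != 200:
        return []
    return [(hit.get("title", ""), hit.get("url", "")) for hit in envelope.get("results", [])]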
2372
+ === Running test_hf_report_generate.py ===
2373
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_report_generate.py
2374
+ --- INPUT (snippet) ---
2375
+ tool.forward(data_json=json.dumps(data), title="Test Report")
2376
+ print(html[:500]) # print first 500 chars
2377
+
2378
+
2379
+ if __name__ == "__main__":
2380
+ main()
2381
+
2382
+
2383
+ --- STDOUT ---
2384
+ <!DOCTYPE html>
2385
+ <html lang="en">
2386
+ <head>
2387
+ <meta charset="utf-8" />
2388
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
2389
+ <title>Test Report</title>
2390
+ <style>
2391
+ :root { --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; --warn:#eab308; }
2392
+ body { background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }
2393
+ h1 { font-size: 24px; margin: 0 0
2394
+ --- STDERR ---
2395
+
2396
+ [OK] test_hf_report_generate.py
2397
+ --------------------------------------------------------------------------------
2398
+
2399
+ === Running test_hf_generate_dashboard_report.py ===
2400
+ Command: C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\venv\Scripts\python.exe C:\Users\daqc\Documents\GitHub\open-deep-research-vulnerability-intelligence\tests\test_hf_generate_dashboard_report.py
2401
+ --- INPUT (snippet) ---
2402
+ tool.forward(query="semantic search", limit=5)
2403
+ print(html[:500]) # print first 500 chars
2404
+
2405
+
2406
+ if __name__ == "__main__":
2407
+ main()
2408
+
2409
+
2410
+ --- STDOUT ---
2411
+ <!DOCTYPE html>
2412
+ <html lang="en">
2413
+ <head>
2414
+ <meta charset="utf-8" />
2415
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
2416
+ <title>Hugging Search � Dashboard</title>
2417
+ <style>
2418
+ :root { --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; --warn:#eab308; }
2419
+ body { background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }
2420
+ .container { max-wi
2421
+ --- STDERR ---
2422
+
2423
+ [OK] test_hf_generate_dashboard_report.py
2424
+ --------------------------------------------------------------------------------
2425
+
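All sixteen scripts above are reported as [OK] even when the payload itself signals a problem (the http_404 collection lookup, the empty paper-repos list, the unprintable daily-papers output), because, as the runner below shows, [OK] is derived solely from a zero process exit code. A stricter harness could also fail on the envelope status; a minimal sketch, assuming the JSON shapes printed in this log:

import json
import sys

def assert_envelope_ok(result_json_str: str) -> dict:
    """Exit non-zero when a tool envelope reports a non-200 status or an error string."""
    envelope = json.loads(result_json_str)
    if envelope.get("status") != 200 or envelope.get("error"):
        print(f"tool failure: status={envelope.get('status')} error={envelope.get('error')!r}", file=sys.stderr)
        sys.exit(1)
    return envelope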
tests/run_all_hf_tools_tests.py ADDED
@@ -0,0 +1,99 @@
1
+ import subprocess
2
+ import sys
3
+ import os
4
+ from datetime import datetime
5
+
6
+ TESTS = [
7
+ "test_hf_models_search.py",
8
+ "test_hf_model_info.py",
9
+ "test_hf_datasets_search.py",
10
+ "test_hf_dataset_info.py",
11
+ "test_hf_spaces_search.py",
12
+ "test_hf_space_info.py",
13
+ "test_hf_user_info.py",
14
+ "test_hf_collections_list.py",
15
+ "test_hf_collection_get.py",
16
+ "test_hf_paper_info.py",
17
+ "test_hf_paper_repos.py",
18
+ "test_hf_daily_papers.py",
19
+ "test_hf_repo_info.py",
20
+ "test_hf_site_search.py",
21
+ "test_hf_report_generate.py",
22
+ "test_hf_generate_dashboard_report.py",
23
+ ]
24
+
25
+
26
+ def main():
27
+ base = os.path.dirname(__file__)
28
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
29
+ out_path = os.path.join(base, f"hf_tools_tests_output_{timestamp}.txt")
30
+
31
+ ok = True
32
+ with open(out_path, "w", encoding="utf-8", errors="replace") as f:
33
+ f.write(f"Hugging Face Tools Test Run — {timestamp}\n")
34
+ f.write("=" * 80 + "\n\n")
35
+ for t in TESTS:
36
+ path = os.path.join(base, t)
37
+ header = f"=== Running {t} ===\n"
38
+ print("\n" + header, end="")
39
+ f.write(header)
40
+ # Write simple INPUT info: command and forward() snippet if found
41
+ f.write(f"Command: {sys.executable} {path}\n")
42
+ try:
43
+ with open(path, "r", encoding="utf-8", errors="replace") as tf:
44
+ src = tf.read()
45
+ snippet = ""
46
+ key = "tool.forward("
47
+ idx = src.find(key)
48
+ if idx != -1:
49
+ end = idx
50
+ # capture up to 500 chars from forward( to show parameters
51
+ snippet = src[idx: idx + 500]
52
+ else:
53
+ # fallback: show main() body first 400 chars
54
+ m_idx = src.find("def main():")
55
+ snippet = src[m_idx: m_idx + 400] if m_idx != -1 else src[:400]
56
+ f.write("--- INPUT (snippet) ---\n")
57
+ f.write(snippet)
58
+ if not snippet.endswith("\n"):
59
+ f.write("\n")
60
+ except Exception as e:
61
+ f.write(f"--- INPUT (unavailable): {e}\n")
62
+ try:
63
+ result = subprocess.run(
64
+ [sys.executable, path],
65
+ stdout=subprocess.PIPE,
66
+ stderr=subprocess.PIPE,
67
+ text=True,
68
+ encoding="utf-8",
69
+ errors="replace",
70
+ )
71
+ f.write("--- STDOUT ---\n")
72
+ f.write(result.stdout or "")
73
+ if not result.stdout.endswith("\n"):
74
+ f.write("\n")
75
+ f.write("--- STDERR ---\n")
76
+ f.write(result.stderr or "")
77
+ if not result.stderr.endswith("\n"):
78
+ f.write("\n")
79
+ status_line = f"[OK] {t}\n" if result.returncode == 0 else f"[FAIL] {t}\n"
80
+ f.write(status_line)
81
+ f.write("-" * 80 + "\n\n")
82
+ if result.returncode != 0:
83
+ ok = False
84
+ print(status_line.strip())
85
+ except Exception as e:
86
+ ok = False
87
+ err_line = f"[ERROR] {t}: {e}\n"
88
+ f.write(err_line)
89
+ f.write("-" * 80 + "\n\n")
90
+ print(err_line.strip())
91
+
92
+ print(f"\nResults saved to: {out_path}")
93
+ sys.exit(0 if ok else 1)
94
+
95
+
96
+ if __name__ == "__main__":
97
+ main()
98
+
99
+
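Usage note: run from the repository root, the script executes the sixteen test scripts listed in TESTS, writes each command, input snippet, stdout, and stderr to a timestamped tests/hf_tools_tests_output_<timestamp>.txt, and exits non-zero if any script fails, so it can gate a CI job directly. A minimal sketch of driving it programmatically:

import subprocess
import sys

# Run the whole HF-tools suite and surface its exit code (0 = every script exited cleanly).
completed = subprocess.run([sys.executable, "tests/run_all_hf_tools_tests.py"])
print("suite passed" if completed.returncode == 0 else "suite failed")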
tests/test_hf_collection_get.py ADDED
@@ -0,0 +1,26 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+
7
+ from scripts.hf_tools import HFCollectionGetTool
8
+
9
+
10
+ def main():
11
+ tool = HFCollectionGetTool()
12
+ # Example namespace/slug - replace with a public collection if needed
13
+ namespace = "huggingface"
14
+ slug_id = "trending-models" # may vary; adjust to an existing collection
15
+ result_json_str = tool.forward(namespace=namespace, slug_id=slug_id)
16
+ try:
17
+ data = json.loads(result_json_str)
18
+ print(json.dumps(data, indent=2, ensure_ascii=False))
19
+ except Exception:
20
+ print(result_json_str)
21
+
22
+
23
+ if __name__ == "__main__":
24
+ main()
25
+
26
+
tests/test_hf_collections_list.py ADDED
@@ -0,0 +1,23 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+
7
+ from scripts.hf_tools import HFCollectionsListTool
8
+
9
+
10
+ def main():
11
+ tool = HFCollectionsListTool()
12
+ result_json_str = tool.forward(owner=None) # or set an owner like "huggingface"
13
+ try:
14
+ data = json.loads(result_json_str)
15
+ print(json.dumps(data, indent=2, ensure_ascii=False))
16
+ except Exception:
17
+ print(result_json_str)
18
+
19
+
20
+ if __name__ == "__main__":
21
+ main()
22
+
23
+
tests/test_hf_daily_papers.py ADDED
@@ -0,0 +1,26 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+
7
+ from scripts.hf_tools import HFDailyPapersTool
8
+
9
+
10
+ def main():
11
+ tool = HFDailyPapersTool()
12
+ result_json_str = tool.forward()
13
+ try:
14
+ data = json.loads(result_json_str)
15
+ print(json.dumps(data, indent=2, ensure_ascii=False))
16
+ except Exception:
17
+ try:
18
+ print(result_json_str.encode("utf-8", errors="replace").decode("utf-8", errors="replace"))
19
+ except Exception:
20
+ print("<unprintable> due to encoding error")
21
+
22
+
23
+ if __name__ == "__main__":
24
+ main()
25
+
26
+
tests/test_hf_dataset_info.py ADDED
@@ -0,0 +1,24 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+
7
+ from scripts.hf_tools import HFDatasetInfoTool
8
+
9
+
10
+ def main():
11
+ tool = HFDatasetInfoTool()
12
+ repo_id = "glue"
13
+ result_json_str = tool.forward(repo_id=repo_id)
14
+ try:
15
+ data = json.loads(result_json_str)
16
+ print(json.dumps(data, indent=2, ensure_ascii=False))
17
+ except Exception:
18
+ print(result_json_str)
19
+
20
+
21
+ if __name__ == "__main__":
22
+ main()
23
+
24
+
tests/test_hf_datasets_search.py ADDED
@@ -0,0 +1,29 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
6
+
7
+ from scripts.hf_tools import HFDatasetsSearchTool
8
+
9
+
10
+ def main():
11
+ tool = HFDatasetsSearchTool()
12
+ result_json_str = tool.forward(
13
+ query="sentiment analysis",
14
+ tags="language:es",
15
+ sort="downloads",
16
+ direction="descending",
17
+ limit=5,
18
+ )
19
+ try:
20
+ data = json.loads(result_json_str)
21
+ print(json.dumps(data, indent=2, ensure_ascii=False))
22
+ except Exception:
23
+ print(result_json_str)
24
+
25
+
26
+ if __name__ == "__main__":
27
+ main()
28
+
29
+
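The same search parameters accept other Hub filter tags. A minimal sketch with an alternative filter, run from the repository root; the "toxicity" query and the "task_categories:text-classification" tag are illustrative placeholders, not values taken from this test run:

import json

from scripts.hf_tools import HFDatasetsSearchTool

tool = HFDatasetsSearchTool()
raw = tool.forward(
    query="toxicity",                             # illustrative query
    tags="task_categories:text-classification",   # illustrative Hub tag filter
    sort="downloads",
    direction="descending",
    limit=3,
)
print(json.dumps(json.loads(raw), indent=2, ensure_ascii=False))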
tests/test_hf_generate_dashboard_report.py ADDED
@@ -0,0 +1,18 @@
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
5
+
6
+ from scripts.hf_tools import HFDashboardReportTool
7
+
8
+
9
+ def main():
10
+ tool = HFDashboardReportTool()
11
+ html = tool.forward(query="semantic search", limit=5)
12
+ print(html[:500]) # print first 500 chars
13
+
14
+
15
+ if __name__ == "__main__":
16
+ main()
17
+
18
+
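The dashboard tool returns a complete HTML document as a string (only the first 500 characters are printed above), so persisting it is a single UTF-8 file write; writing it to disk also sidesteps the console-encoding problem that produced the replacement character in the captured <title> earlier in this diff. A minimal sketch, placed alongside the other tests:

import os
import sys
from pathlib import Path

sys.path.append(os.path.dirname(os.path.dirname(__file__)))

from scripts.hf_tools import HFDashboardReportTool

# Generate the dashboard and write it to disk instead of printing a truncated preview.
html = HFDashboardReportTool().forward(query="semantic search", limit=5)
out_path = Path(__file__).with_name("dashboard_report.html")  # hypothetical output location
out_path.write_text(html, encoding="utf-8")
print(f"wrote {out_path} ({len(html)} characters)")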