# Spaces: Sleeping (status banner captured together with the page; not part of the program)
import gradio as gr
import polars as pl
# Languages offered in the UI: display name -> two-letter language code.
favourite_langs = {"English": "en", "Romanian": "ro", "German": "de"}
# Dropdown choices are the display names, in the dict's insertion order.
options = list(favourite_langs)
# Dataset identifiers selectable in the UI; search_text lower-cases the chosen
# one to build the parquet URL.
models = ["ENRO", "DERO"]
# NOTE(review): original note here read "English, Romanian" - presumably the
# column names of the ENRO dataset; confirm against the dataset itself.
def search_text(input_text, sselected_language, tselected_language, model_name, hits, toggle_case):
    """Search one column of a dataset for a text and return the matching row pairs.

    Args:
        input_text: Text to look for. It is matched literally, so characters
            such as ``(``, ``.`` or ``?`` have no special meaning.
        sselected_language: Name of the column that is searched.
        tselected_language: Name of the column returned next to each match.
        model_name: Dataset identifier; lower-cased to build the parquet URL.
        hits: Maximum number of rows to return.
        toggle_case: If true the match is case sensitive, otherwise letter
            case is ignored.

    Returns:
        A ``(rows, message)`` tuple. ``rows`` is a polars DataFrame holding the
        source and target columns (at most ``hits`` rows), or ``None`` when the
        request could not be served. ``message`` is the status or error text.
    """
    # Selecting the same column twice makes polars raise a duplicate-column
    # error, so reject that combination up front, before anything is downloaded.
    if sselected_language == tselected_language:
        return None, "Source and target language must be different"

    path_to_model = f"https://huggingface.co/api/datasets/TiberiuCristianLeon/2RO/parquet/{model_name.lower()}/train/0.parquet"
    df = pl.read_parquet(path_to_model)

    # The chosen dataset may lack one of the selected columns (presumably e.g.
    # German in ENRO - TODO confirm); report that instead of raising.
    missing = [name for name in (sselected_language, tselected_language) if name not in df.columns]
    if missing:
        return None, (
            f"Dataset {model_name} has no column {', '.join(missing)}; "
            f"available columns: {', '.join(df.columns)}"
        )

    source_column = pl.col(sselected_language)
    if toggle_case:
        # Case sensitive literal match.
        matches = source_column.str.contains(input_text, literal=True)
    else:
        # Case insensitive literal match: compare lower-cased text on both
        # sides rather than building a "(?i)" regex from raw user input.
        matches = source_column.str.to_lowercase().str.contains(input_text.lower(), literal=True)

    # The slider value may arrive as a float; head() needs an integer.
    list_of_arrays = df.filter(matches).select([sselected_language, tselected_language]).head(int(hits))
    message_text = f'Done! Found {len(list_of_arrays)} entries'
    return list_of_arrays, message_text
# Swap the values of the source and target language dropdowns.
def swap_languages(src_lang, tgt_lang):
    """Return the two language selections in reversed order.

    Args:
        src_lang: Current source language selection.
        tgt_lang: Current target language selection.

    Returns:
        The tuple ``(tgt_lang, src_lang)``.
    """
    return tgt_lang, src_lang
def create_interface():
    """Build the Gradio Blocks UI for searching text in a dataset.

    The UI has a search box, source/target language dropdowns with a swap
    button, a case-sensitivity checkbox, a dataset selector, a results table,
    a message field and a slider for the number of hits. Both the Search
    button and pressing Enter in the search box call ``search_text``.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks() as interface:
        gr.Markdown("## Search Text in Dataset")
        # NOTE(review): the original indentation was lost, so the grouping of
        # components into rows below is a reconstruction - confirm the layout.
        with gr.Row():
            input_text = gr.Textbox(label="Enter text to search:", placeholder="Type your text here...", info="Press Enter key to start search")
        with gr.Row():
            sselected_language = gr.Dropdown(choices=options, value=options[0], label="Source language", interactive=True)
            tselected_language = gr.Dropdown(choices=options, value=options[1], label="Target language", interactive=True)
            swap_button = gr.Button("Swap Languages")
            swap_button.click(fn=swap_languages, inputs=[sselected_language, tselected_language], outputs=[sselected_language, tselected_language])
        toggle_case = gr.Checkbox(info="Case sensitive search", label="Toggle case sensitive search", value=True, interactive=True, visible=True)
        model_name = gr.Dropdown(choices=models, label="Select a dataset", value=models[0], interactive=True)
        search_button = gr.Button("Search")
        translated_text = gr.Dataframe(
            label="Returned entries:",
            interactive=False,
            headers=[options[0], options[1]],
            datatype=["str", "str"],
            col_count=(2, "fixed"),
            type="polars",
            wrap=True,
            show_row_numbers=False,
            show_copy_button=True,
        )
        message_text = gr.Textbox(label="Messages:", placeholder="Display field for status and error messages", interactive=False)
        # step=1 keeps the default (10) and the maximum (100) on the slider's
        # step grid; with minimum=1 and step=5 the grid was 1, 6, 11, ..., 96.
        hits = gr.Slider(
            minimum=1,
            maximum=100,
            value=10,
            step=1,
            label="Number of returned hits",
        )

        # The button click and the Enter key run the same search with the same
        # components, so the wiring is defined once.
        search_inputs = [input_text, sselected_language, tselected_language, model_name, hits, toggle_case]
        search_outputs = [translated_text, message_text]
        search_button.click(search_text, inputs=search_inputs, outputs=search_outputs)
        # Submit the form when Enter is pressed in the input_text textbox
        input_text.submit(search_text, inputs=search_inputs, outputs=search_outputs)
    return interface
if __name__ == "__main__":
    # Build and serve the UI only when this file is executed as a script.
    interface = create_interface()
    interface.launch()