Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -2,11 +2,20 @@ from gradio import Interface | |
| 2 | 
             
            import gradio as gr
         | 
| 3 | 
             
            import aranizer
         | 
| 4 | 
             
            from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 5 |  | 
| 6 | 
             
            # List of available tokenizers and a dictionary to load them
         | 
| 7 | 
             
            tokenizer_options = [
         | 
| 8 | 
             
                "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
         | 
| 9 | 
            -
                "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k"
         | 
|  | |
|  | |
|  | |
| 10 | 
             
            ]
         | 
| 11 |  | 
| 12 | 
             
            tokenizers = {
         | 
| @@ -17,33 +26,49 @@ tokenizers = { | |
| 17 | 
             
                "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
         | 
| 18 | 
             
                "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
         | 
| 19 | 
             
                "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
         | 
|  | |
|  | |
|  | |
| 20 | 
             
            }
         | 
| 21 |  | 
| 22 | 
             
            def compare_tokenizers(tokenizer_name, text):
         | 
| 23 | 
            -
                #  | 
| 24 | 
            -
                 | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 28 |  | 
| 29 | 
             
                # Prepare the results to be displayed
         | 
| 30 | 
             
                results = [(tokenizer_name, tokens, encoded_output, decoded_text)]
         | 
| 31 | 
             
                return results
         | 
|  | |
|  | |
| 32 | 
             
            inputs_component = [
         | 
| 33 | 
             
                gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
         | 
| 34 | 
            -
                gr.Textbox(lines=2, placeholder="Enter  | 
| 35 | 
             
            ]
         | 
| 36 |  | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
|  | |
| 41 |  | 
| 42 | 
            -
            # Setting up the interface | 
| 43 | 
            -
            iface = Interface( | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
|  | |
|  | |
|  | |
| 47 |  | 
| 48 | 
             
            # Launching the Gradio app
         | 
| 49 | 
            -
            iface.launch()
         | 
# NOTE(review): the file's first line (outside this view) is
# `from gradio import Interface`, per the diff hunk header.
import gradio as gr
import aranizer
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer

# Load additional tokenizers from transformers.
# NOTE(review): these three calls download/initialize the tokenizers eagerly
# at import time (network + disk cache); if startup cost matters, consider
# lazy loading (e.g. wrapping each in functools.lru_cache) — confirm with
# the deployment target before changing.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
# Names shown in the UI dropdown; each entry must have a matching key in
# the `tokenizers` loader mapping defined below.
tokenizer_options = [
    "aranizer_bpe50k",
    "aranizer_bpe64k",
    "aranizer_bpe86k",
    "aranizer_sp32k",
    "aranizer_sp50k",
    "aranizer_sp64k",
    "aranizer_sp86k",
    "FreedomIntelligence/AceGPT-13B",  # GPT-style tokenizer (transformers)
    "FreedomIntelligence/AceGPT-7B",   # GPT-style tokenizer (transformers)
    "inception-mbzuai/jais-13b",       # Jais tokenizer (transformers)
]
# Mapping from display name to a zero-argument loader callable, so every
# entry is consumed uniformly via `tokenizers[name]()`.
# NOTE(review): the bpe50k/bpe64k/bpe86k/sp32k entries were elided in this
# view; they are reconstructed here from `tokenizer_options` and the
# aranizer imports — confirm against the full file.
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    # The transformers tokenizers are already instantiated at module load;
    # wrap them in lambdas so they share the loader-callable interface.
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
}
# Names served by Hugging Face `transformers` rather than AraNizer; their
# encode() returns a tensor and decode() needs special-token stripping.
# Hoisted to a module constant so it is not rebuilt on every call.
_TRANSFORMER_TOKENIZERS = frozenset({
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b",
})


def compare_tokenizers(tokenizer_name, text):
    """Tokenize *text* with the selected tokenizer and return one result row.

    Args:
        tokenizer_name: a key of the module-level `tokenizers` mapping.
        text: the input string to tokenize.

    Returns:
        A single-element list of (name, tokens, encoded output, decoded
        text) tuples — the row shape the Gradio Dataframe output expects.

    Raises:
        KeyError: if `tokenizer_name` is not in `tokenizers`.
    """
    # Common to both APIs: resolve the loader and tokenize.
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)

    if tokenizer_name in _TRANSFORMER_TOKENIZERS:
        # transformers API: encode to a (1, seq) tensor; strip special
        # tokens when decoding so the round-trip shows the original text.
        encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
        decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
    else:
        # AraNizer API: plain list of ids with a symmetric decode.
        encoded_output = tokenizer.encode(text, add_special_tokens=True)
        decoded_text = tokenizer.decode(encoded_output)

    # Prepare the results to be displayed (one row per call).
    results = [(tokenizer_name, tokens, encoded_output, decoded_text)]
    return results
# --- Gradio UI wiring -------------------------------------------------------

# Define the Gradio interface components with a dropdown for model selection.
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
    gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
]

# Results table; one row per compare_tokenizers() call.
outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
    label="Results",
    type="pandas",
)

# Setting up the interface. Uses `gr.Interface` for consistency with the
# `import gradio as gr` style used everywhere else in this file (the bare
# `Interface` name is also imported at the top, so both resolve identically).
# NOTE(review): live=True re-runs compare_tokenizers on every keystroke,
# which is expensive with large transformer tokenizers — confirm intended.
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Tokenizer Comparison",
    live=True,
)

# Launching the Gradio app
iface.launch()
