import os
import re
import time

import nltk
import pytest

from tests.utils import wrap_test_forked

# Simple regex word tokenizer, used as a fast baseline.
WORD = re.compile(r'\w+')


def nltkTokenize(text):
    # NLTK's word tokenizer; requires the 'punkt' tokenizer data.
    words = nltk.word_tokenize(text)
    return words


def regTokenize(text):
    # Cheap approximation: just match runs of word characters.
    words = WORD.findall(text)
    return words
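
# Illustration (not part of the original tests): a common rule of thumb is
# that English text averages roughly 1.3 BPE tokens per word, so the regex
# word count above can double as a cheap token estimate. The 1.3 factor is an
# assumption for this sketch, not something this file measures.
def approxTokenCount(text, tokens_per_word=1.3):
    """Estimate an LLM token count from a fast regex word count."""
    return int(len(regTokenize(text)) * tokens_per_word)
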
@wrap_test_forked
def test_tokenizer1():
    prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:
def fib(n):
    a, b = 0, 1
    if n == 0 or n == 1:
        return a
    for i in range(n-2):
        a, b = b, a+b
    return b
for i in range(10):
    print(fib(i))
This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.
The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.
In the main part of the program, we use a for loop to call the fib function with different"""
    # Allow overriding the benchmark text from the environment.
    prompt = os.getenv('PROMPT', prompt)
    run_tokenizer1(prompt)
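
# Usage sketch: the test above reads an optional PROMPT environment variable,
# so it can be run against your own text, e.g.
#   PROMPT="$(cat my_doc.txt)" pytest -s -k test_tokenizer1
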
def run_tokenizer1(prompt):
    # Heavy imports are kept local so merely collecting the tests stays cheap.
    from transformers import AutoTokenizer
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)
@wrap_test_forked
def test_fake_tokenizer():
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    # Encoding the special token directly must fail: the underlying encoding
    # disallows special tokens in plain text by default.
    try:
        t.encoding.encode('<|endoftext|>')
        raise RuntimeError("Shouldn't reach here")
    except ValueError as e:
        assert "disallowed special token" in str(e)
if __name__ == '__main__':
    test_tokenizer1()