Neu256 committed on
Commit ad52a00 · verified · 1 Parent(s): 5818a55

Update utils.py

Files changed (1)
  1. utils.py +88 -88
utils.py CHANGED
@@ -1,88 +1,88 @@
-import os
-import torch
-from datetime import datetime
-
-# hyperparameters
-BATCH_SIZE = 32 # how many independent sequences will we process in parallel?
-BLOCK_SIZE = 128 # what is the maximum context length for predictions?
-MAX_ITER = 2 # number of training iterations
-EVAL_INTER = 1
-LEARNING_RATE = 1e-5
-EPS = 1e-5
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-NUM_HEAD = 6
-NUM_EMBED = NUM_HEAD * 128
-NUM_LAYER = 8
-DROPOUT = 0.3
-MAX_SEQ_LEN = 2048
-
-def encode(text_seq: str, tokenizer: any) -> torch.Tensor:
-    """
-    Function to encode input text using a pre-trained tokenizer and vectorized lookups
-    """
-    # tokenize the input text
-    tokens = tokenizer.tokenize(text_seq)
-    # convert the tokens to their corresponding ids
-    token_indices = tokenizer.convert_tokens_to_ids(tokens)
-    token_indices = torch.tensor(token_indices, dtype=torch.long)
-    return token_indices
-
-
-def decode(enc_sec: torch.Tensor, tokenizer: any) -> str:
-    """
-    Function to decode a sequence of token indices back to a string
-    """
-    # convert the indices to a list
-    enc_sec = enc_sec.tolist()
-    # decode the indices to a string
-    text = tokenizer.decode(enc_sec)
-    return text
-
-
-def get_batch(data: list[str], block_size: int, batch_size: int):
-    """
-    This is a simple function to create batches of data.
-    GPUs allow for parallel processing we can feed multiple chunks at once
-    so that's why we would need batches - how many independant sequences
-    will we process in parallel.
-
-    Parameters:
-    data: list[str]: data to take batch from
-    block_size (int): size of the text that is proccessed at once
-    batch_size (int): number of sequences to process in parallel
-
-    Returns:
-    x, y: a tuple with token sequence and token target
-    """
-
-    ix = torch.randint(len(data) - block_size, (batch_size, ))
-    # we stack batch_size rows of sentences
-    # so x and y are the matrices with rows_num=batch_size
-    # and col_num=block_size
-    x = torch.stack([data[i : i + block_size] for i in ix])
-    # y is x shifted one position right - because we predict
-    # word in y having all the previous words as context
-    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
-    x, y = x.to(DEVICE), y.to(DEVICE)
-    return x, y
-
-
-@torch.no_grad()
-def estimate_loss(
-    val_loader,
-    model: torch.nn.Module,
-    eval_iters: int = 10
-):
-    out = {}
-    model.eval()
-    losses = torch.zeros(eval_iters)
-    k = 0
-    for x, y in val_loader:
-        if k >= eval_iters:
-            break
-        logits, loss = model.forward(x, y)
-        losses[k] = loss.item()
-        k += 1
-    out = losses.mean()
-    model.train()
-    return out
+import os
+import torch
+from datetime import datetime
+
+# hyperparameters
+BATCH_SIZE = 64 # how many independent sequences will we process in parallel?
+BLOCK_SIZE = 128 # what is the maximum context length for predictions?
+MAX_ITER = 2 # number of training iterations
+EVAL_INTER = 1
+LEARNING_RATE = 1e-5
+EPS = 1e-5
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+NUM_HEAD = 6
+NUM_EMBED = NUM_HEAD * 128
+NUM_LAYER = 8
+DROPOUT = 0.3
+MAX_SEQ_LEN = 2048
+
+def encode(text_seq: str, tokenizer: any) -> torch.Tensor:
+    """
+    Function to encode input text using a pre-trained tokenizer and vectorized lookups
+    """
+    # tokenize the input text
+    tokens = tokenizer.tokenize(text_seq)
+    # convert the tokens to their corresponding ids
+    token_indices = tokenizer.convert_tokens_to_ids(tokens)
+    token_indices = torch.tensor(token_indices, dtype=torch.long)
+    return token_indices
+
+
+def decode(enc_sec: torch.Tensor, tokenizer: any) -> str:
+    """
+    Function to decode a sequence of token indices back to a string
+    """
+    # convert the indices to a list
+    enc_sec = enc_sec.tolist()
+    # decode the indices to a string
+    text = tokenizer.decode(enc_sec)
+    return text
+
+
+def get_batch(data: list[str], block_size: int, batch_size: int):
+    """
+    This is a simple function to create batches of data.
+    GPUs allow for parallel processing we can feed multiple chunks at once
+    so that's why we would need batches - how many independant sequences
+    will we process in parallel.
+
+    Parameters:
+    data: list[str]: data to take batch from
+    block_size (int): size of the text that is proccessed at once
+    batch_size (int): number of sequences to process in parallel
+
+    Returns:
+    x, y: a tuple with token sequence and token target
+    """
+
+    ix = torch.randint(len(data) - block_size, (batch_size, ))
+    # we stack batch_size rows of sentences
+    # so x and y are the matrices with rows_num=batch_size
+    # and col_num=block_size
+    x = torch.stack([data[i : i + block_size] for i in ix])
+    # y is x shifted one position right - because we predict
+    # word in y having all the previous words as context
+    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
+    x, y = x.to(DEVICE), y.to(DEVICE)
+    return x, y
+
+
+@torch.no_grad()
+def estimate_loss(
+    val_loader,
+    model: torch.nn.Module,
+    eval_iters: int = 10
+):
+    out = {}
+    model.eval()
+    losses = torch.zeros(eval_iters)
+    k = 0
+    for x, y in val_loader:
+        if k >= eval_iters:
+            break
+        logits, loss = model.forward(x, y)
+        losses[k] = loss.item()
+        k += 1
+    out = losses.mean()
+    model.train()
+    return out
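
For context: the only content change in this diff is BATCH_SIZE going from 32 to 64; the rest of utils.py is unchanged. The sketch below is illustrative only and not part of the commit. It shows how these utilities might be wired into a training loop with the updated batch size. The tokenizer checkpoint, the data path, and the DummyModel stand-in are assumptions made for demonstration; the repository's actual model (expected to return a (logits, loss) pair from forward) would take DummyModel's place.

```python
# Illustrative sketch only -- not part of this commit.
import torch
from transformers import AutoTokenizer  # assumed tokenizer source

from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    LEARNING_RATE,
    MAX_ITER,
    NUM_EMBED,
    encode,
    get_batch,
)


class DummyModel(torch.nn.Module):
    """Minimal stand-in mimicking the expected (logits, loss) interface."""

    def __init__(self, vocab_size: int):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, NUM_EMBED)
        self.head = torch.nn.Linear(NUM_EMBED, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.head(self.embed(idx))  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1)
            )
        return logits, loss


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
raw_text = open("data.txt", encoding="utf-8").read()            # placeholder corpus path
data = encode(raw_text, tokenizer)                              # 1-D LongTensor of token ids

model = DummyModel(tokenizer.vocab_size).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for step in range(MAX_ITER):
    # sample a random batch of BLOCK_SIZE-long sequences and their shifted targets
    xb, yb = get_batch(data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(f"step {step}: loss {loss.item():.4f}")
```

estimate_loss would be called periodically (every EVAL_INTER steps) with an iterable of (x, y) validation batches already on DEVICE; it averages the loss over at most eval_iters batches with the model temporarily in eval mode.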