lukeingawesome committed on
Commit
6c19590
·
verified ·
1 Parent(s): af8b7ea

Upload LLM2Vec4CXR fine-tuned model

Browse files
Files changed (2) hide show
  1. README.md +34 -60
  2. usage_example.py +9 -18
README.md CHANGED
@@ -62,8 +62,6 @@ pip install -e .
62
  ### Basic Usage
63
 
64
  ```python
65
- import torch
66
- import torch.nn.functional as F
67
  from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec
68
 
69
  # Load the model
@@ -75,74 +73,50 @@ model = LLM2Vec.from_pretrained(
75
  torch_dtype=torch.bfloat16,
76
  )
77
 
78
- # Configure tokenizer
79
- tokenizer = model.tokenizer
80
- tokenizer.padding_side = 'left'
 
 
 
 
 
 
 
 
 
81
 
82
- # Example usage for chest X-ray report analysis
83
- def encode_text(text):
84
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
85
- # IMPORTANT: Add embed_mask for proper model functioning
86
- # For simple text encoding, embed_mask is the same as attention_mask
87
- inputs["embed_mask"] = inputs["attention_mask"].clone()
88
- with torch.no_grad():
89
- embeddings = model(inputs)
90
- return embeddings
91
 
92
- # Example with medical text
93
- report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
94
- embedding = encode_text(report)
 
 
 
 
 
 
95
  ```
96
 
97
- **Note**: The model requires an `embed_mask` input. For simple text encoding, set `embed_mask` equal to `attention_mask`. For instruction-following tasks, use the separator-based tokenization shown below.
98
 
99
- ### Advanced Usage with Separator-based Processing
100
 
101
- The model supports special separator-based processing for instruction-following tasks:
102
 
103
  ```python
104
- def tokenize_with_separator(texts, tokenizer, max_length):
105
- """Tokenize texts with special handling for separator-based splitting."""
106
- texts_2 = []
107
- original_texts = []
108
- separator = '!@#$%^&*()'
109
-
110
- for text in texts:
111
- parts = text.split(separator)
112
- texts_2.append(parts[1] if len(parts) > 1 else "")
113
- original_texts.append("".join(parts))
114
-
115
- tokenized = tokenizer(
116
- original_texts,
117
- return_tensors="pt",
118
- padding=True,
119
- truncation=True,
120
- max_length=max_length,
121
- )
122
 
123
- # Create embedding masks for the separated parts
124
- embed_mask = None
125
- for t_i, t in enumerate(texts_2):
126
- ids = tokenizer([t], return_tensors="pt", padding=True, truncation=True,
127
- max_length=max_length, add_special_tokens=False)
128
- e_m = torch.zeros_like(tokenized["attention_mask"][t_i])
129
- if len(ids["input_ids"][0]) > 0:
130
- e_m[-len(ids["input_ids"][0]):] = torch.ones(len(ids["input_ids"][0]))
131
- if embed_mask is None:
132
- embed_mask = e_m.unsqueeze(0)
133
- else:
134
- embed_mask = torch.cat((embed_mask, e_m.unsqueeze(0)), dim=0)
135
-
136
- tokenized["embed_mask"] = embed_mask
137
- return tokenized
138
-
139
- # Example with instruction and report
140
- separator = '!@#$%^&*()'
141
- instruction = 'Determine the change or the status of the pleural effusion.'
142
- report = 'There is a small increase in the left-sided effusion.'
143
- text = instruction + separator + report
144
 
145
- tokenized = tokenize_with_separator([text], tokenizer, 512)
 
146
  embedding = model(tokenized)
147
  ```
148
 
 
62
  ### Basic Usage
63
 
64
  ```python
 
 
65
  from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec
66
 
67
  # Load the model
 
73
  torch_dtype=torch.bfloat16,
74
  )
75
 
76
+ # Simple text encoding (built-in method)
77
+ report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
78
+ embedding = model.encode_text(report)
79
+
80
+ # Multiple texts at once
81
+ reports = [
82
+ "No acute cardiopulmonary abnormality.",
83
+ "Small bilateral pleural effusions.",
84
+ "Large left pleural effusion with compressive atelectasis."
85
+ ]
86
+ embeddings = model.encode_text(reports)
87
+ ```
88
 
89
+ ### Advanced Usage with Instructions
 
 
 
 
 
 
 
 
90
 
91
+ ```python
92
+ # For instruction-following tasks with separator
93
+ separator = '!@#$%^&*()'
94
+ instruction = 'Determine the change or the status of the pleural effusion.'
95
+ report = 'There is a small increase in the left-sided effusion.'
96
+ text_with_instruction = instruction + separator + report
97
+
98
+ # Use the built-in method for instruction-based encoding
99
+ embedding = model.encode_with_instruction([text_with_instruction])
100
  ```
101
 
102
+ **Note**: The model now includes convenient `encode_text()` and `encode_with_instruction()` methods that handle the `embed_mask` automatically.
103
 
104
+ ### Manual Usage (if you need more control)
105
 
106
+ If you need more control over the tokenization process, you can still use the manual approach:
107
 
108
  ```python
109
+ # Manual tokenization with embed_mask
110
+ def encode_text_manual(model, text):
111
+ inputs = model.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
112
+ inputs["embed_mask"] = inputs["attention_mask"].clone() # Required for proper functioning
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
+ with torch.no_grad():
115
+ embeddings = model(inputs)
116
+ return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ # For instruction-based tasks, use the built-in tokenize_with_separator method
119
+ tokenized = model.tokenize_with_separator([text_with_instruction])
120
  embedding = model(tokenized)
121
  ```
122
 
usage_example.py CHANGED
@@ -154,20 +154,15 @@ def main():
154
  model = model.to(device).to(torch.bfloat16)
155
  model.eval()
156
 
157
- # Example 1: Basic text embedding
158
  print("\n" + "="*60)
159
- print("Example 1: Basic Text Embedding")
160
  print("="*60)
161
 
162
  report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
163
 
164
- inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
165
- # Add embed_mask (same as attention_mask for simple text encoding)
166
- inputs["embed_mask"] = inputs["attention_mask"].clone()
167
- inputs = inputs.to(device)
168
-
169
- with torch.no_grad():
170
- embedding = model(inputs)
171
 
172
  print(f"Report: {report}")
173
  print(f"Embedding shape: {embedding.shape}")
@@ -195,8 +190,9 @@ def main():
195
 
196
  all_texts = [text] + comparison_options
197
 
198
- # Compute similarities
199
- _, similarities = compute_similarities(model, tokenizer, all_texts, device)
 
200
 
201
  print(f"Original text: {report}")
202
  print(f"Instruction: {instruction}")
@@ -224,13 +220,8 @@ def main():
224
  ]
225
 
226
  print("Computing embeddings for multiple reports...")
227
- inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
228
- # Add embed_mask (same as attention_mask for simple text encoding)
229
- inputs["embed_mask"] = inputs["attention_mask"].clone()
230
- inputs = inputs.to(device)
231
-
232
- with torch.no_grad():
233
- embeddings = model(inputs)
234
 
235
  # Compute pairwise similarities
236
  similarity_matrix = F.cosine_similarity(
 
154
  model = model.to(device).to(torch.bfloat16)
155
  model.eval()
156
 
157
+ # Example 1: Basic text embedding using built-in method
158
  print("\n" + "="*60)
159
+ print("Example 1: Basic Text Embedding (Built-in Method)")
160
  print("="*60)
161
 
162
  report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
163
 
164
+ # Use the convenient built-in method
165
+ embedding = model.encode_text(report)
 
 
 
 
 
166
 
167
  print(f"Report: {report}")
168
  print(f"Embedding shape: {embedding.shape}")
 
190
 
191
  all_texts = [text] + comparison_options
192
 
193
+ # Use built-in method for instruction-based encoding
194
+ embeddings = model.encode_with_instruction(all_texts)
195
+ similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)
196
 
197
  print(f"Original text: {report}")
198
  print(f"Instruction: {instruction}")
 
220
  ]
221
 
222
  print("Computing embeddings for multiple reports...")
223
+ # Use built-in method for multiple texts
224
+ embeddings = model.encode_text(reports)
 
 
 
 
 
225
 
226
  # Compute pairwise similarities
227
  similarity_matrix = F.cosine_similarity(