Upload LLM2Vec4CXR fine-tuned model

Browse files

Files changed (2) hide show

README.md +34 -60
usage_example.py +9 -18

README.md CHANGED Viewed

@@ -62,8 +62,6 @@ pip install -e .
 ### Basic Usage
 ```python
-import torch
-import torch.nn.functional as F
 from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec
 # Load the model
@@ -75,74 +73,50 @@ model = LLM2Vec.from_pretrained(
     torch_dtype=torch.bfloat16,
 )
-# Configure tokenizer
-tokenizer = model.tokenizer
-tokenizer.padding_side = 'left'
-# Example usage for chest X-ray report analysis
-def encode_text(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    # IMPORTANT: Add embed_mask for proper model functioning
-    # For simple text encoding, embed_mask is the same as attention_mask
-    inputs["embed_mask"] = inputs["attention_mask"].clone()
-    with torch.no_grad():
-        embeddings = model(inputs)
-    return embeddings
-# Example with medical text
-report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
-embedding = encode_text(report)
 ```
-**Note**: The model requires an `embed_mask` input. For simple text encoding, set `embed_mask` equal to `attention_mask`. For instruction-following tasks, use the separator-based tokenization shown below.
-### Advanced Usage with Separator-based Processing
-The model supports special separator-based processing for instruction-following tasks:
 ```python
-def tokenize_with_separator(texts, tokenizer, max_length):
-    """Tokenize texts with special handling for separator-based splitting."""
-    texts_2 = []
-    original_texts = []
-    separator = '!@#$%^&*()'
-    for text in texts:
-        parts = text.split(separator)
-        texts_2.append(parts[1] if len(parts) > 1 else "")
-        original_texts.append("".join(parts))
-    tokenized = tokenizer(
-        original_texts,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=max_length,
-    )
-    # Create embedding masks for the separated parts
-    embed_mask = None
-    for t_i, t in enumerate(texts_2):
-        ids = tokenizer([t], return_tensors="pt", padding=True, truncation=True,
-                       max_length=max_length, add_special_tokens=False)
-        e_m = torch.zeros_like(tokenized["attention_mask"][t_i])
-        if len(ids["input_ids"][0]) > 0:
-            e_m[-len(ids["input_ids"][0]):] = torch.ones(len(ids["input_ids"][0]))
-        if embed_mask is None:
-            embed_mask = e_m.unsqueeze(0)
-        else:
-            embed_mask = torch.cat((embed_mask, e_m.unsqueeze(0)), dim=0)
-    tokenized["embed_mask"] = embed_mask
-    return tokenized
-# Example with instruction and report
-separator = '!@#$%^&*()'
-instruction = 'Determine the change or the status of the pleural effusion.'
-report = 'There is a small increase in the left-sided effusion.'
-text = instruction + separator + report
-tokenized = tokenize_with_separator([text], tokenizer, 512)
 embedding = model(tokenized)
 ```

 ### Basic Usage
 ```python
 from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec
 # Load the model
     torch_dtype=torch.bfloat16,
 )
+# Simple text encoding (built-in method)
+report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
+embedding = model.encode_text(report)
+# Multiple texts at once
+reports = [
+    "No acute cardiopulmonary abnormality.",
+    "Small bilateral pleural effusions.",
+    "Large left pleural effusion with compressive atelectasis."
+]
+embeddings = model.encode_text(reports)
+```
+### Advanced Usage with Instructions
+```python
+# For instruction-following tasks with separator
+separator = '!@#$%^&*()'
+instruction = 'Determine the change or the status of the pleural effusion.'
+report = 'There is a small increase in the left-sided effusion.'
+text_with_instruction = instruction + separator + report
+# Use the built-in method for instruction-based encoding
+embedding = model.encode_with_instruction([text_with_instruction])
 ```
+**Note**: The model now includes convenient `encode_text()` and `encode_with_instruction()` methods that handle the `embed_mask` automatically.
+### Manual Usage (if you need more control)
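+If you need more control over the tokenization process, you can still use the manual approach (the snippet below needs `import torch` and reuses `text_with_instruction` from the previous example):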
 ```python
+# Manual tokenization with embed_mask
+def encode_text_manual(model, text):
+    inputs = model.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs["embed_mask"] = inputs["attention_mask"].clone()  # Required for proper functioning
+    with torch.no_grad():
+        embeddings = model(inputs)
+    return embeddings
+# For instruction-based tasks, use the built-in tokenize_with_separator method
+tokenized = model.tokenize_with_separator([text_with_instruction])
 embedding = model(tokenized)
 ```

usage_example.py CHANGED Viewed

@@ -154,20 +154,15 @@ def main():
     model = model.to(device).to(torch.bfloat16)
     model.eval()
-    # Example 1: Basic text embedding
     print("\n" + "="*60)
-    print("Example 1: Basic Text Embedding")
     print("="*60)
     report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
-    inputs = tokenizer(report, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    # Add embed_mask (same as attention_mask for simple text encoding)
-    inputs["embed_mask"] = inputs["attention_mask"].clone()
-    inputs = inputs.to(device)
-    with torch.no_grad():
-        embedding = model(inputs)
     print(f"Report: {report}")
     print(f"Embedding shape: {embedding.shape}")
@@ -195,8 +190,9 @@ def main():
     all_texts = [text] + comparison_options
-    # Compute similarities
-    _, similarities = compute_similarities(model, tokenizer, all_texts, device)
     print(f"Original text: {report}")
     print(f"Instruction: {instruction}")
@@ -224,13 +220,8 @@ def main():
     ]
     print("Computing embeddings for multiple reports...")
-    inputs = tokenizer(reports, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    # Add embed_mask (same as attention_mask for simple text encoding)
-    inputs["embed_mask"] = inputs["attention_mask"].clone()
-    inputs = inputs.to(device)
-    with torch.no_grad():
-        embeddings = model(inputs)
     # Compute pairwise similarities
     similarity_matrix = F.cosine_similarity(

     model = model.to(device).to(torch.bfloat16)
     model.eval()
+    # Example 1: Basic text embedding using built-in method
     print("\n" + "="*60)
+    print("Example 1: Basic Text Embedding (Built-in Method)")
     print("="*60)
     report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."
+    # Use the convenient built-in method
+    embedding = model.encode_text(report)
     print(f"Report: {report}")
     print(f"Embedding shape: {embedding.shape}")
     all_texts = [text] + comparison_options
+    # Use built-in method for instruction-based encoding
+    embeddings = model.encode_with_instruction(all_texts)
+    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)
     print(f"Original text: {report}")
     print(f"Instruction: {instruction}")
     ]
     print("Computing embeddings for multiple reports...")
+    # Use built-in method for multiple texts
+    embeddings = model.encode_text(reports)
     # Compute pairwise similarities
     similarity_matrix = F.cosine_similarity(