---
license: apache-2.0
---

**How to use the model**

To use the model with the `transformers` package, see the example below:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "Ihor/OpenBioLLM-Text2Graph-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

MESSAGES = [
    {
        "role": "system",
        "content": (
            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
            "Output the annotated data in JSON format, structured as follows:\n\n"
            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
        ),
    },
    {
        "role": "user",
        "content": (
            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
        ),
    },
]

# Build the prompt text from the chat template
chat_prompt = tokenizer.apply_chat_template(
    MESSAGES, tokenize=False, add_generation_prompt=True
)

# Tokenize
inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=3000,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_dict_in_generate=True
)

# Decode ONLY the new tokens (skip the prompt tokens)
prompt_len = inputs["input_ids"].shape[-1]
generated_ids = outputs.sequences[0][prompt_len:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(response)
```
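
The model responds with the JSON object described in the system prompt. A minimal sketch for turning that response into a graph-style edge list follows; `parse_graph` is a hypothetical helper, not part of the model card, and it assumes the response contains one valid JSON object (possibly wrapped in extra text):
```python
import json

def parse_graph(response: str):
    """Hypothetical helper: parse the model's JSON output into entities and triples."""
    # Trim anything outside the outermost JSON object, in case the model
    # wraps the annotation in commentary or markdown fences.
    start, end = response.find("{"), response.rfind("}")
    if start == -1 or end == -1:
        raise ValueError("No JSON object found in model response")
    data = json.loads(response[start : end + 1])

    # Index entities by id so relation endpoints can be resolved.
    entities = {e["id"]: e for e in data.get("entities", [])}
    # Resolve each directed relation into a human-readable triple,
    # skipping relations that point at missing entity ids.
    triples = [
        (entities[r["head"]]["text"], r["type"], entities[r["tail"]]["text"])
        for r in data.get("relations", [])
        if r["head"] in entities and r["tail"] in entities
    ]
    return entities, triples

entities, triples = parse_graph(response)
for head, rel, tail in triples:
    print(f"{head} --[{rel}]--> {tail}")
```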

To use the model with the `vllm` package, please refer to the example below:
```python
# !pip install vllm

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

MODEL_ID = "Ihor/OpenBioLLM-Text2Graph-8B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

llm = LLM(model=MODEL_ID)

# Greedy, deterministic decoding (temperature=0.0 with a fixed seed)
sampling_params = SamplingParams(
    max_tokens=3000,
    n=1,
    best_of=1,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    repetition_penalty=1.0,
    temperature=0.0,
    top_p=1.0,
    top_k=-1,
    min_p=0.0,
    seed=42,
)

MESSAGES = [
    {
        "role": "system",
        "content": (
            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
            "Output the annotated data in JSON format, structured as follows:\n\n"
            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
        ),
    },
    {
        "role": "user",
        "content": (
            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
        ),
    },
]

# Build the prompt string; vLLM consumes the raw text directly
chat_prompt = tokenizer.apply_chat_template(
    MESSAGES,
    tokenize=False,
    add_generation_prompt=True,
    add_special_tokens=False,
)

outputs = llm.generate([chat_prompt], sampling_params)
response_text = outputs[0].outputs[0].text
print(response_text)
```
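
Because `llm.generate` accepts a list of prompts, many documents can be annotated in one batched call. The sketch below reuses `tokenizer`, `llm`, `sampling_params`, and `MESSAGES` from the example above; `make_prompt` and the sample `texts` are illustrative, not part of the model card:
```python
# Reuse the system prompt from the MESSAGES defined above.
SYSTEM_PROMPT = MESSAGES[0]["content"]

def make_prompt(text: str) -> str:
    # Hypothetical helper: wrap a raw document in the same chat format as above.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f'Here is a text input: "{text}" '
                "Analyze this text, select and classify the entities, "
                "and extract their relationships as per your instructions."
            ),
        },
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, add_special_tokens=False
    )

# Illustrative inputs; replace with your own documents.
texts = [
    "Patients received 50mg of oral metformin twice daily for 12 weeks.",
    "A single 2g dose of IV cefazolin was given 30 minutes before incision.",
]

# vLLM schedules all prompts within a single generate() call and
# returns results in the same order as the input list.
batch_outputs = llm.generate([make_prompt(t) for t in texts], sampling_params)
for text, out in zip(texts, batch_outputs):
    print(text, "->", out.outputs[0].text)
```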