### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization

You can find the paper [here](https://arxiv.org/abs/2506.11115).

If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please use the script below.

```python
import os
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, BertForTokenClassification
from tqdm import tqdm

model_path = 'MatDetector_ckp'  # path to the MatDetector checkpoint (replace with your local path)
# you can download matbert at https://github.com/lbnlp/MatBERT
tokenizer_path = '/matbert-base-cased'
input_file = 'TARGET.txt'      # one candidate concept per line
output_directory = './'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False)
model = BertForTokenClassification.from_pretrained(model_path).half()  # fp16 inference

device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")  # adjust the GPU index for your machine
model.to(device)
model.eval()

# B-/I- tags for material names (matname) and material formulas (mf)
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}


def process_single_word(word, tokenizer, model, device):
    """Run MatDetector on a single word/phrase and return its token-level label probabilities."""
    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=2)  # (batch_size=1, seq_len, num_labels)

    return tokenized, probabilities


def determine_label(tokenized, probabilities, label_map):
    """Majority-vote over the sub-token labels to get one label for the whole word."""
    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    probs = probabilities[0]  # (seq_len, num_labels)

    token_labels = []
    for token, prob in zip(tokens, probs):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue

        clean_token = token[2:] if token.startswith("##") else token
        max_label = prob.argmax().item()
        label_name = label_map[max_label]

        token_labels.append(label_name)

    label_counts = {}
    for label in token_labels:
        if label not in label_counts:
            label_counts[label] = 0
        label_counts[label] += 1

    final_label = max(label_counts, key=label_counts.get) if label_counts else "O"

    return final_label


# Route every input line into one of three files depending on its predicted label.
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \
     open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \
     open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file:

    with open(input_file, 'r') as file:
        lines = [line.strip() for line in file.readlines() if line.strip()]
    total_lines = len(lines)

    with tqdm(total=total_lines, desc="Processing words", unit="words") as progress_bar:
        for original_word in lines:
            tokenized, probabilities = process_single_word(original_word, tokenizer, model, device)
            final_label = determine_label(tokenized, probabilities, label_map)

            if final_label == "O":
                o_file.write(f"{original_word}\n")
            elif final_label in ["B-mf", "I-mf"]:
                mf_file.write(f"{original_word}\n")
            elif final_label in ["B-matname", "I-matname"]:
                matname_file.write(f"{original_word}\n")

            progress_bar.update(1)

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
```
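
The input file is expected to contain one candidate concept per line, and each line is routed to exactly one of the three output files. The sketch below (with made-up example terms; the file names match the script above) writes a small `TARGET.txt` and, after the script has been run from the same directory, prints where each line ended up:

```python
# Hypothetical example input: one candidate term or formula per line.
sample_terms = ["LiFePO4", "perovskite", "annealing"]  # made-up candidate concepts
with open("TARGET.txt", "w") as f:
    f.write("\n".join(sample_terms) + "\n")

# After running the script above, each input line appears in exactly one of:
# mf.txt (material formulas), matname.txt (material terms), o_tags.txt (everything else).
for name in ["mf.txt", "matname.txt", "o_tags.txt"]:
    with open(name) as f:
        print(name, "->", [line.strip() for line in f if line.strip()])
```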