yerim0210 committed on
Commit
cbe4c21
·
verified ·
1 Parent(s): bf1410e

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +95 -0
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
2
+
3
+ You can find the paper [here](https://arxiv.org/abs/2506.11115).
4
+
5
+
6
+ If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please follow the steps below.
7
+
8
+ ```python
9
+ import os
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from transformers import AutoTokenizer, BertForTokenClassification
13
+ from tqdm import tqdm
14
+
15
# Path to the MatDetector checkpoint passed to `from_pretrained`.
# This is a placeholder: replace it with the location of your downloaded checkpoint.
model_path = 'MatDetector_ckp'
# you can download matbert at https://github.com/lbnlp/MatBERT
tokenizer_path = '/matbert-base-cased'
# Text file to classify; each non-empty line is treated as one word.
input_file = 'TARGET.txt'
# Directory that receives mf.txt, matname.txt and o_tags.txt.
output_directory = './'


tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False)
model = BertForTokenClassification.from_pretrained(model_path)

# Use the default CUDA device rather than a fixed index, so the script also
# runs on machines with fewer than three GPUs (select a specific GPU with
# CUDA_VISIBLE_DEVICES if needed).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Half precision is only applied on GPU.
    # NOTE(review): float16 inference on CPU is not supported by every
    # PyTorch build, so the CPU fallback keeps the model in float32.
    model = model.half()
model.to(device)
model.eval()

# Class index -> tag name used when decoding the classifier output.
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}
30
+
31
def process_single_word(word, tokenizer, model, device):
    """Run the token classifier on one word and return per-token probabilities.

    Args:
        word: Text to classify.
        tokenizer: Callable invoked as
            ``tokenizer(word, return_tensors="pt", truncation=True, max_length=128)``;
            its result must provide ``"input_ids"`` and ``"attention_mask"``
            tensors.
        model: Callable invoked with ``input_ids`` and ``attention_mask``
            keyword arguments; its result must expose a ``logits`` attribute.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A ``(tokenized, probabilities)`` tuple: the unmodified tokenizer output
        and the float32 softmax of the logits over their last dimension.
    """
    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model may emit float16 logits; upcast before the softmax so the
        # probabilities do not lose precision and the op does not depend on
        # half-precision kernel support.
        # Expected layout per the original note: (batch_size=1, seq_len, num_labels).
        probabilities = F.softmax(outputs.logits.float(), dim=-1)

    return tokenized, probabilities
42
+
43
+
44
def determine_label(tokenized, probabilities, label_map, tok=None):
    """Pick one label for a word by majority vote over its sub-word tokens.

    Each token that is not ``[CLS]``, ``[SEP]`` or ``[PAD]`` votes for the
    label with the highest probability. The label with the most votes wins;
    on a tie, the label that was seen first wins.

    Args:
        tokenized: Mapping whose ``"input_ids"`` entry is indexed at ``[0]``
            and converted with ``.tolist()`` to get the token ids.
        probabilities: Indexed at ``[0]`` to get one probability row per token.
        label_map: Mapping from class index to label name.
        tok: Optional object providing ``convert_ids_to_tokens``. When omitted,
            the module-level ``tokenizer`` is used.

    Returns:
        The winning label name, or ``"O"`` when no token cast a vote.
    """
    from collections import Counter

    if tok is None:
        # Backward-compatible default: rely on the tokenizer defined at module level.
        tok = tokenizer

    tokens = tok.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    special_tokens = {"[CLS]", "[SEP]", "[PAD]"}

    # Counter keeps first-seen insertion order, so max() below resolves ties
    # in favour of the earliest label.
    label_counts = Counter(
        label_map[prob.argmax().item()]
        for token, prob in zip(tokens, probabilities[0])
        if token not in special_tokens
    )

    if not label_counts:
        return "O"
    return max(label_counts, key=label_counts.get)
68
+
69
+
70
# Read the input before opening the outputs, so a missing or unreadable input
# file does not leave freshly truncated, empty result files behind.
# UTF-8 is requested explicitly so non-ASCII characters are handled the same
# way on every platform.
with open(input_file, 'r', encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()]

with open(os.path.join(output_directory, "mf.txt"), "w", encoding="utf-8") as mf_file, \
     open(os.path.join(output_directory, "matname.txt"), "w", encoding="utf-8") as matname_file, \
     open(os.path.join(output_directory, "o_tags.txt"), "w", encoding="utf-8") as o_file:

    # The B- and I- variants of a tag share one output file.
    destinations = {
        "O": o_file,
        "B-mf": mf_file,
        "I-mf": mf_file,
        "B-matname": matname_file,
        "I-matname": matname_file,
    }

    for original_word in tqdm(lines, desc="Processing words", unit="words"):
        tokenized, probabilities = process_single_word(original_word, tokenizer, model, device)
        final_label = determine_label(tokenized, probabilities, label_map)

        # Labels without a destination are skipped, as before.
        target = destinations.get(final_label)
        if target is not None:
            target.write(f"{original_word}\n")

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
93
+
94
+ ```
95
+