|
|
---
base_model:
- alan-yahya/MatBERT
pipeline_tag: feature-extraction
tags:
- chemistry
- biology
---
|
|
### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
|
|
|
|
|
You can find the paper [here](https://arxiv.org/abs/2506.11115).
|
|
|
|
|
|
|
|
If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please follow the steps below.
|
|
|
|
|
```python
|
|
import os |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
from transformers import AutoTokenizer, BertForTokenClassification |
|
|
from tqdm import tqdm |
|
|
|
|
|
model_path = MatDetector_ckp |
|
|
# you can download matbert at https://github.com/lbnlp/MatBERT |
|
|
tokenizer_path = '/matbert-base-cased' |
|
|
input_file = 'TARGET.txt' |
|
|
output_directory = './' |
|
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False) |
|
|
model = BertForTokenClassification.from_pretrained(model_path).half() |
|
|
|
|
|
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu") |
|
|
model.to(device) |
|
|
model.eval() |
|
|
|
|
|
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"} |
|
|
|
|
|
def process_single_word(word, tokenizer, model, device): |
|
|
tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128) |
|
|
input_ids = tokenized["input_ids"].to(device) |
|
|
attention_mask = tokenized["attention_mask"].to(device) |
|
|
|
|
|
with torch.no_grad(): |
|
|
outputs = model(input_ids=input_ids, attention_mask=attention_mask) |
|
|
logits = outputs.logits |
|
|
probabilities = F.softmax(logits, dim=2) # (batch_size=1, seq_len, num_labels) |
|
|
|
|
|
return tokenized, probabilities |
|
|
|
|
|
|
|
|
def determine_label(tokenized, probabilities, label_map): |
|
|
tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist()) |
|
|
probs = probabilities[0] # (seq_len, num_labels) |
|
|
|
|
|
token_labels = [] |
|
|
for token, prob in zip(tokens, probs): |
|
|
if token in ["[CLS]", "[SEP]", "[PAD]"]: |
|
|
continue |
|
|
|
|
|
clean_token = token[2:] if token.startswith("##") else token |
|
|
max_label = prob.argmax().item() |
|
|
label_name = label_map[max_label] |
|
|
|
|
|
token_labels.append(label_name) |
|
|
|
|
|
label_counts = {} |
|
|
for label in token_labels: |
|
|
if label not in label_counts: |
|
|
label_counts[label] = 0 |
|
|
label_counts[label] += 1 |
|
|
|
|
|
final_label = max(label_counts, key=label_counts.get) if label_counts else "O" |
|
|
|
|
|
return final_label |
|
|
|
|
|
|
|
|
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \ |
|
|
open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \ |
|
|
open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file: |
|
|
|
|
|
with open(input_file, 'r') as file: |
|
|
lines = [line.strip() for line in file.readlines() if line.strip()] |
|
|
total_lines = len(lines) |
|
|
|
|
|
with tqdm(total=total_lines, desc="Processing words", unit="words") as progress_bar: |
|
|
for original_word in lines: |
|
|
tokenized, probabilities = process_single_word(original_word, tokenizer, model, device) |
|
|
final_label = determine_label(tokenized, probabilities, label_map) |
|
|
|
|
|
if final_label == "O": |
|
|
o_file.write(f"{original_word}\n") |
|
|
elif final_label in ["B-mf", "I-mf"]: |
|
|
mf_file.write(f"{original_word}\n") |
|
|
elif final_label in ["B-matname", "I-matname"]: |
|
|
matname_file.write(f"{original_word}\n") |
|
|
|
|
|
progress_bar.update(1) |
|
|
|
|
|
print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.") |
|
|
|
|
|
```