Spaces:

Trinity2105
/

ung-dung-them-dau-tieng-viet

Sleeping

App Files Files Community

ung-dung-them-dau-tieng-viet / app.py

Trinity2105

Update app.py

34fd336 verified 4 months ago

raw

history blame contribute delete

3.98 kB

	import os
	import pickle
	import re
	import unicodedata

	import gradio as gr

	# Paths to the model directory and the data files.
	# NOTE(review): model_dir is a machine-specific absolute Windows path and is not
	# referenced anywhere else in the visible code - confirm whether it is still needed.
	model_dir = r"D:\Visual Studio Code\Data\Lab8\ung-dung-them-dau-tieng-viet"
	# Pickled language model; a relative path, opened in binary mode at start-up.
	model_path = "kneserney_ngram_model.pkl"
	# UTF-8 text file read line by line to build the accent-candidate dictionary.
	syllables_path = "vn_syllables.txt"

	# Text-processing helper functions
	def _build_vn_accent_table():
	    """Return a ``str.translate`` table mapping accented Vietnamese letters to ASCII.

	    Both the lowercase and the uppercase form of every letter are covered.
	    """
	    groups = {
	        'a': 'áàảãạăắằẳẵặâấầẩẫậ',
	        'e': 'éèẻẽẹêếềểễệ',
	        'o': 'óòỏõọôốồổỗộơớờởỡợ',
	        'i': 'íìỉĩị',
	        'u': 'úùủũụưứừửữự',
	        'y': 'ýỳỷỹỵ',
	        'd': 'đ',
	    }
	    table = {}
	    for base, accented in groups.items():
	        # Normalise the literals as well, so every key is a single precomposed
	        # code point regardless of how this source file was saved.
	        for ch in unicodedata.normalize("NFC", accented):
	            table[ord(ch)] = base
	            table[ord(ch.upper())] = base.upper()
	    return table


	# Built once at import time; remove_vn_accent is called for every dictionary
	# entry and every input word.
	_VN_ACCENT_TABLE = _build_vn_accent_table()


	def remove_vn_accent(word):
	    """Strip Vietnamese diacritics from *word*.

	    Every accented Vietnamese vowel is replaced by its base ASCII vowel and
	    ``đ``/``Đ`` by ``d``/``D``. Letter case is preserved, and characters
	    outside the Vietnamese alphabet are left as they are.

	    The input is first normalised to NFC so that text typed with combining
	    marks (decomposed form, e.g. ``a`` + U+0301) is handled the same way as
	    precomposed text.

	    Args:
	        word: Any string; it may contain several words.

	    Returns:
	        The NFC-normalised string with Vietnamese diacritics removed.
	    """
	    return unicodedata.normalize("NFC", word).translate(_VN_ACCENT_TABLE)

	def load_vn_syllables():
	    """Read the syllable list and group its entries by unaccented spelling.

	    Each line of the file at ``syllables_path`` is one entry. The key for an
	    entry is its lowercased, accent-free form; the value is the set of
	    entries (as written in the file) that share that key.

	    Returns:
	        A dict mapping unaccented spelling to a set of file entries, or an
	        empty dict when the file does not exist (a message is printed).
	    """
	    try:
	        with open(syllables_path, encoding="utf8") as handle:
	            entries = handle.read().splitlines()
	    except FileNotFoundError:
	        print("Không tìm thấy file vn_syllables.txt")
	        return {}

	    variants_by_plain = {}
	    for entry in entries:
	        plain = remove_vn_accent(entry.lower())
	        variants_by_plain.setdefault(plain, set()).add(entry)
	    return variants_by_plain

	# Build the unaccented -> accented lookup once at import time; it is an empty
	# dict when the syllable file could not be found.
	syllable_map = load_vn_syllables()

	def gen_accents_word(word):
	    """Return the candidate accented spellings for *word*.

	    The word is lowercased and stripped of diacritics to form the lookup key
	    into ``syllable_map``. When the key is unknown, a one-element set holding
	    the word exactly as it was passed in is returned instead.
	    """
	    lookup_key = remove_vn_accent(word.lower())
	    if lookup_key in syllable_map:
	        return syllable_map[lookup_key]
	    return {word}

	def beam_search(words, model, k=3):
	    """Beam-search the most likely accented rendering of a word sequence.

	    For each input word the accented candidates come from
	    ``gen_accents_word``. Every kept sequence is extended with every
	    candidate, scored with ``model.logscore`` using the last one or two
	    words of the sequence as context, and only the ``k`` highest-scoring
	    extensions survive to the next word.

	    Args:
	        words: Unaccented words, in sentence order.
	        model: Object exposing ``logscore(word, context)`` where ``context``
	            is a list of the preceding word(s).
	        k: Beam width, i.e. how many sequences to keep after each word.

	    Returns:
	        A list of ``(word_list, total_log_score)`` tuples, best first. It is
	        empty when ``words`` is empty.
	    """
	    sequences = []
	    for idx, word in enumerate(words):
	        # Computed once per word rather than once per beam entry, and sorted
	        # so equal-scoring candidates are always ranked the same way
	        # (iterating the raw set would make ties depend on hash order).
	        candidates = sorted(gen_accents_word(word))

	        if idx == 0:
	            # NOTE(review): first-word candidates all start at 0.0 and are not
	            # pruned to k, so for a single-word input every candidate is
	            # returned unscored - confirm whether that is intended.
	            sequences = [([x], 0.0) for x in candidates]
	            continue

	        expanded = []
	        for seq_words, seq_score in sequences:
	            for next_word in candidates:
	                # The slice yields the previous two words, or just one while
	                # the sequence is a single word long. Slicing instead of
	                # catching IndexError keeps errors raised inside the model
	                # from being mistaken for a short context.
	                score = model.logscore(next_word, seq_words[-2:])
	                expanded.append((seq_words + [next_word], seq_score + score))

	        # list.sort is stable, so ties keep the deterministic candidate order.
	        expanded.sort(key=lambda item: item[1], reverse=True)
	        sequences = expanded[:k]
	    return sequences

	# Load the language model.
	# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
	# model_path must only ever point at a trusted artifact shipped with the app.
	model_loaded = None
	try:
	    with open(model_path, 'rb') as fin:
	        model_loaded = pickle.load(fin)
	except FileNotFoundError:
	    print("Không tìm thấy file mô hình.")
	except (pickle.UnpicklingError, EOFError):
	    # EOFError is what pickle raises for an empty or truncated file; treat it
	    # like any other corrupt model so model_loaded stays None instead of the
	    # script aborting here.
	    print("Lỗi khi tải mô hình.")

	def them_dau(text):
	    """Add Vietnamese diacritics to an unaccented sentence.

	    The text is split on whitespace and the single best sequence found by
	    ``beam_search`` (beam width 1) is joined back with single spaces.

	    Args:
	        text: The user's input. ``None``, an empty string, or whitespace
	            only yields an empty string.

	    Returns:
	        The accented sentence. When no model is loaded the input is
	        returned unchanged.
	    """
	    if not text or not text.strip():
	        return ""  # Nothing to accent
	    if model_loaded is None:
	        # Scoring needs the model; without it beam_search would raise
	        # AttributeError on any input longer than one word.
	        return text
	    words = text.split()
	    best_sequences = beam_search(words, model_loaded, k=1)
	    return ' '.join(best_sequences[0][0])

	# Single-textbox web UI around them_dau.
	iface = gr.Interface(
	    fn=them_dau,
	    inputs=gr.Textbox(lines=2, placeholder="Nhập câu tiếng Việt không dấu..."),
	    outputs="text",
	    title="Ứng dụng thêm dấu tiếng Việt",
	    description="Nhập câu không dấu và nhận kết quả đã thêm dấu!",
	)

	# Report missing resources BEFORE launch(): when run as a script, launch()
	# blocks the main thread by default, so anything placed after it would only
	# print once the server has shut down.
	if model_loaded is None:
	    print("CẢNH BÁO: Không thể tải mô hình. Ứng dụng sẽ chạy nhưng không thể thêm dấu.")
	if not syllable_map:
	    print("CẢNH BÁO: Không thể tải file từ điển.")

	# NOTE(review): share=True requests a public tunnel link; confirm it is
	# wanted in the environment this app is deployed to.
	iface.launch(share=True)  # launch() is called exactly once, here