import streamlit as st
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from transformers.models.roberta.modeling_roberta import RobertaConfig, RobertaModel, RobertaPreTrainedModel
import nltk
from nltk import word_tokenize
import json
import pandas as pd
# import re
# import base64
# Set the background image
# background_image = """
# <style>
# [data-testid="stAppViewContainer"] > .main {
#     background-image: url("https://images.unsplash.com/photo-1542281286-9e0a16bb7366");
#     background-size: 100vw 100vh;  # Cover 100% of the viewport width and height
#     background-position: center;
#     background-repeat: no-repeat;
# }
# </style>
# """
# st.markdown(background_image, unsafe_allow_html=True)

# def set_bg_hack(main_bg):
#     '''
#     A function to unpack an image from the root folder and set it as the background.
#
#     Returns
#     -------
#     The background.
#     '''
#     # set bg name
#     main_bg_ext = "png"
#     st.markdown(
#         f"""
#         <style>
#         .stApp {{
#             background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(main_bg, "rb").read()).decode()});
#             background-size: cover
#         }}
#         </style>
#         """,
#         unsafe_allow_html=True
#     )
# set_bg_hack("Background.png")

# image_url = "logo1.png"
# # Display the image without a caption, scaled down to a small size
# st.image(image_url, width=100)
# Download punkt for nltk
print("===================================================================")

def download_nltk_punkt():
    nltk.download('punkt_tab')
# Cache loading PhoBert model and tokenizer
@st.cache_resource  # assumed intent: the comment above says "cache", so Streamlit's resource cache is applied
def load_phoBert():
    model = AutoModelForSequenceClassification.from_pretrained('minhdang14902/Phobert_Law')
    tokenizer = AutoTokenizer.from_pretrained('minhdang14902/Phobert_Law')
    return model, tokenizer
# Call the cached functions
download_nltk_punkt()
phoBert_model, phoBert_tokenizer = load_phoBert()

# Initialize the pipeline with the loaded PhoBert model and tokenizer
chatbot_pipeline = pipeline("sentiment-analysis", model=phoBert_model, tokenizer=phoBert_tokenizer)
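# The "sentiment-analysis" task here is just a convenient text-classification
# pipeline; the PhoBert checkpoint was fine-tuned on intent tags, so the
# returned 'label' is an intent tag, not a sentiment. Illustrative sketch
# (hypothetical label value, not from the actual dataset):
#   chatbot_pipeline("some legal question")  # -> [{'label': '<intent tag>', 'score': 0.97}]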
# Load spaCy Vietnamese model
# nlp = spacy.load('vi_core_news_lg')

# Load intents from json file
def load_json_file(filename):
    with open(filename, encoding='utf-8') as f:  # explicit UTF-8 for the Vietnamese JSON
        file = json.load(f)
    return file

filename = './Law_2907.json'
intents = load_json_file(filename)
def create_df():
    df = pd.DataFrame({
        'Pattern': [],
        'Tag': []
    })
    return df

df = create_df()

def extract_json_info(json_file, df):
    for intent in json_file['intents']:
        for pattern in intent['patterns']:
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
    return df

df = extract_json_info(intents, df)
df2 = df.copy()

labels = df2['Tag'].unique().tolist()
labels = [s.strip() for s in labels]
num_labels = len(labels)
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}
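# Illustrative shape of the label maps (actual tags come from Law_2907.json):
#   id2label = {0: '<tag_0>', 1: '<tag_1>', ...}
#   label2id = {'<tag_0>': 0, '<tag_1>': 1, ...}
# label2id turns the classifier's predicted tag string back into an index into
# intents['intents'] inside chatRoberta below.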
# def tokenize_with_spacy(text):
#     doc = nlp(text)
#     tokens = [token.text for token in doc]
#     tokenized_text = ' '.join(tokens)
#     tokenized_text = re.sub(r'(?<!\s)([.,?])', r' \1', tokenized_text)
#     tokenized_text = re.sub(r'([.,?])(?!\s)', r'\1 ', tokenized_text)
#     return tokenized_text
# Load Roberta model and tokenizer
_CHECKPOINT_FOR_DOC = "roberta-base"
_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

class MRCQuestionAnswering(RobertaPreTrainedModel):
    config_class = RobertaConfig

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _reorder_cache(self, past, beam_idx):
        pass

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
    def forward(
        self,
        input_ids=None,
        words_lengths=None,
        start_idx=None,
        end_idx=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        span_answer_ids=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=None,  # Roberta doesn't use token_type_ids
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        context_embedding = sequence_output

        # Build a (batch, max_word, max_sub_word) 0/1 matrix marking which
        # sub-word positions belong to each word
        batch_size = input_ids.shape[0]
        max_sub_word = input_ids.shape[1]
        max_word = words_lengths.shape[1]
        align_matrix = torch.zeros((batch_size, max_word, max_sub_word))

        for i, sample_length in enumerate(words_lengths):
            for j in range(len(sample_length)):
                start_idx = torch.sum(sample_length[:j])
                align_matrix[i][j][start_idx: start_idx + sample_length[j]] = 1 if sample_length[j] > 0 else 0

        align_matrix = align_matrix.to(context_embedding.device)
        # Sum each word's sub-word embeddings into one word-level embedding
        context_embedding_align = torch.bmm(align_matrix, context_embedding)

        logits = self.qa_outputs(context_embedding_align)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model input are clamped to ignored_index
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
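# Toy illustration (not executed) of the sub-word -> word pooling in forward():
#   words_lengths = [[2, 1]] yields align_matrix = [[[1, 1, 0],
#                                                    [0, 0, 1]]]
#   torch.bmm(align_matrix, embeddings) then sums sub-word vectors per word,
#   so qa_outputs predicts start/end positions over words rather than sub-words.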
# roberta_model_checkpoint = "minhdang14902/Roberta_edu"
# roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_checkpoint)
# roberta_model = MRCQuestionAnswering.from_pretrained(roberta_model_checkpoint)

# Cache loading Roberta model and tokenizer
@st.cache_resource  # assumed intent, as with load_phoBert above
def load_roberta_model():
    model = MRCQuestionAnswering.from_pretrained('minhdang14902/Roberta_Law')
    tokenizer = AutoTokenizer.from_pretrained('minhdang14902/Roberta_Law')
    return model, tokenizer

roberta_model, roberta_tokenizer = load_roberta_model()
def chatRoberta(text):
    label = label2id[chatbot_pipeline(text)[0]['label']]
    response = intents['intents'][label]['responses']
    print(response[0])

    QA_input = {
        'question': text,
        'context': response[0]
    }

    # Tokenize input
    encoded_input = tokenize_function(QA_input, roberta_tokenizer)

    # Prepare batch samples
    batch_samples = data_collator([encoded_input], roberta_tokenizer)

    # Model prediction
    roberta_model.eval()
    with torch.no_grad():
        inputs = {
            'input_ids': batch_samples['input_ids'],
            'attention_mask': batch_samples['attention_mask'],
            'words_lengths': batch_samples['words_lengths'],
        }
        outputs = roberta_model(**inputs)

    # Extract answer
    result = extract_answer([encoded_input], outputs, roberta_tokenizer)
    context = response[0]
    return result, context
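# Hedged end-to-end sketch (not executed):
#   result, context = chatRoberta("<user question>")
#   result  # -> [{'answer': '<extracted span>', 'score_start': 0.9, 'score_end': 0.8}]
# 'answer' is a span extracted from the matched intent's response text, which
# doubles as the QA context.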
def tokenize_function(example, tokenizer):
    question_word = word_tokenize(example["question"])
    context_word = word_tokenize(example["context"])

    # Tokenize each word separately so we know how many sub-words it maps to
    question_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in question_word]
    context_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in context_word]
    valid = True
    if len([j for i in question_sub_words_ids + context_sub_words_ids for j in i]) > tokenizer.model_max_length - 1:
        valid = False

    # Layout: <s> question </s> context </s>
    question_sub_words_ids = [[tokenizer.bos_token_id]] + question_sub_words_ids + [[tokenizer.eos_token_id]]
    context_sub_words_ids = context_sub_words_ids + [[tokenizer.eos_token_id]]

    input_ids = [j for i in question_sub_words_ids + context_sub_words_ids for j in i]
    if len(input_ids) > tokenizer.model_max_length:
        valid = False

    words_lengths = [len(item) for item in question_sub_words_ids + context_sub_words_ids]

    return {
        "input_ids": input_ids,
        "words_lengths": words_lengths,
        "valid": valid
    }
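# Illustrative output of tokenize_function (token ids depend on the tokenizer):
#   {'input_ids':     [<bos>, ...question sub-words..., <eos>, ...context sub-words..., <eos>],
#    'words_lengths': [1, <sub-words per word>..., 1, <sub-words per word>..., 1],
#    'valid':         True}
# words_lengths records how many sub-word tokens each word produced, which is
# what MRCQuestionAnswering.forward uses to pool sub-words back into words.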
def data_collator(samples, tokenizer):
    if len(samples) == 0:
        return {}

    def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
        """Convert a list of 1d tensors into a padded 2d tensor."""
        size = max(v.size(0) for v in values)
        res = values[0].new(len(values), size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel()
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, v in enumerate(values):
            copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
        return res

    input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples], pad_idx=tokenizer.pad_token_id)
    attention_mask = torch.zeros_like(input_ids)
    for i in range(len(samples)):
        attention_mask[i][:len(samples[i]['input_ids'])] = 1
    words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)

    batch_samples = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'words_lengths': words_lengths,
    }
    return batch_samples
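# Illustrative padding behaviour (assuming pad_token_id == 1, the RoBERTa default):
#   two samples with input_ids lengths 5 and 3 are right-padded to length 5,
#   and attention_mask marks only the real tokens:
#     input_ids      -> [[...5 ids...], [...3 ids..., 1, 1]]
#     attention_mask -> [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]]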
def extract_answer(inputs, outputs, tokenizer):
    plain_result = []
    for sample_input, start_logit, end_logit in zip(inputs, outputs.start_logits, outputs.end_logits):
        sample_words_length = sample_input['words_lengths']
        input_ids = sample_input['input_ids']

        # Convert the word-level argmax positions back to sub-word indices
        answer_start = sum(sample_words_length[:torch.argmax(start_logit)])
        answer_end = sum(sample_words_length[:torch.argmax(end_logit) + 1])

        if answer_start <= answer_end:
            answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
            if answer == tokenizer.bos_token:
                answer = ''
        else:
            answer = ''

        score_start = torch.max(torch.softmax(start_logit, dim=-1)).cpu().detach().numpy().tolist()
        score_end = torch.max(torch.softmax(end_logit, dim=-1)).cpu().detach().numpy().tolist()

        plain_result.append({
            "answer": answer,
            "score_start": score_start,
            "score_end": score_end
        })
    return plain_result
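# score_start/score_end are the softmax probabilities of the argmax start/end
# words; they could be combined (e.g. multiplied) into one confidence score if
# thresholding were needed. An empty 'answer' means "no span found" and is what
# triggers the T5 fallback in get_response below.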
# ===================== T5 fallback model =====================
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_MAX_LEN = 128   # Adjusted input length
OUTPUT_MAX_LEN = 256  # Adjusted output length

MODEL_NAME = "VietAI/vit5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=INPUT_MAX_LEN)
class T5Model(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["target"].to(DEVICE)
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["target"].to(DEVICE)
        loss, logits = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return {'val_loss': loss}

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)
train_model = T5Model.load_from_checkpoint('./data-law/law-model-v1.ckpt')
train_model.freeze()
def generate_question(question):
    print("tokenizer")
    inputs_encoding = tokenizer(
        question,
        add_special_tokens=True,
        max_length=INPUT_MAX_LEN,
        padding='max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
    ).to(DEVICE)

    print("generate id")
    generate_ids = train_model.model.generate(
        input_ids=inputs_encoding["input_ids"],
        attention_mask=inputs_encoding["attention_mask"],
        max_length=INPUT_MAX_LEN,
        num_beams=4,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    print("decode")
    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]
    response = " ".join(preds[0].split())
    print('--- T5 generation finished ---')
    return response
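# Hedged usage sketch (not executed):
#   generate_question("<user question>")  # -> a beam-searched answer string
# Note that generation is capped at max_length=INPUT_MAX_LEN (128) even though
# OUTPUT_MAX_LEN is 256; raise the cap if longer answers are wanted.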
def get_response(text):
    # Replace this function with your own model to get the bot's answer
    # st.subheader("The Answer is:")
    # st.write(text)
    answer, context = chatRoberta(text)
    result = answer[0]['answer']
    if result == "":
        print("Starting T5 fallback")
        return generate_question(text)
    return result
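# Answer routing: the chat UI below first tries the CSV exact-match dictionary;
# otherwise get_response runs Roberta extractive QA and, when the extracted
# span is empty, falls back to T5 generation.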
# Read the CSV file and build a question -> answer dictionary from it
def load_qa_dict():
    df = pd.read_csv("./data-law/Data_law_2807.csv")  # path to your CSV file
    return dict(zip(df['question'], df['answer']))

qa_dict = load_qa_dict()
st.title("General Law Chatbot")

# Initialize the message history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display the messages from the history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Get input from the user
if prompt := st.chat_input("What is up?"):
    # Add the user's message to the history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display the user's message in the UI
    with st.chat_message("user"):
        st.markdown(prompt)

    # Check whether the prompt is in the dictionary
    if prompt in qa_dict:
        response = qa_dict[prompt]
    else:
        response = get_response(prompt)

    # Display the bot's answer in the UI
    with st.chat_message("assistant"):
        st.markdown(response)

    # Add the bot's answer to the history
    st.session_state.messages.append({"role": "assistant", "content": response})