import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import nltk
from transformers.models.roberta.modeling_roberta import *
from transformers import RobertaForQuestionAnswering
from nltk import word_tokenize
import json
import pandas as pd
import re

# Download punkt for nltk
nltk.download('punkt')

# Load PhoBert model and tokenizer; the "sentiment-analysis" pipeline is used
# here as a generic text classifier over the intent tags.
phoBert_model = AutoModelForSequenceClassification.from_pretrained('minhdang14902/PhoBert_Edu')
phoBert_tokenizer = AutoTokenizer.from_pretrained('minhdang14902/PhoBert_Edu')
chatbot_pipeline = pipeline("sentiment-analysis", model=phoBert_model, tokenizer=phoBert_tokenizer)

# Load spaCy Vietnamese model
# nlp = spacy.load('vi_core_news_lg')

# Load intents from json file
def load_json_file(filename):
    with open(filename) as f:
        file = json.load(f)
    return file


filename = './data/QA_Legal_converted_merged.json'
intents = load_json_file(filename)


def create_df():
    df = pd.DataFrame({
        'Pattern': [],
        'Tag': []
    })
    return df


df = create_df()


def extract_json_info(json_file, df):
    # Flatten the intents JSON into (pattern, tag) rows.
    for intent in json_file['intents']:
        for pattern in intent['patterns']:
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
    return df


df = extract_json_info(intents, df)
df2 = df.copy()

labels = df2['Tag'].unique().tolist()
labels = [s.strip() for s in labels]
num_labels = len(labels)
id2label = {id: label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

# Alternative word-level tokenizer using spaCy (left disabled in the source;
# the regex on the last line is truncated in the original file)
# def tokenize_with_spacy(text):
#     doc = nlp(text)
#     tokens = [token.text for token in doc]
#     tokenized_text = ' '.join(tokens)
#     tokenized_text = re.sub(r'(?


def tokenize_function(question, context, tokenizer):
    # NOTE: the head of this function (down to the first length check) was
    # lost in the source and is reconstructed here following the standard
    # vi-mrc word-level preprocessing; word_tokenize is assumed from the
    # imports above. Each word is tokenized separately so sub-word lengths
    # can be tracked for answer-span reconstruction.
    question_word = word_tokenize(question)
    context_word = word_tokenize(context)

    question_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in question_word]
    context_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in context_word]
    valid = True
    if len([j for i in question_sub_words_ids + context_sub_words_ids for j in i]) > tokenizer.model_max_length - 1:
        valid = False

    # Add special tokens: <s> question </s> context </s>
    question_sub_words_ids = [[tokenizer.bos_token_id]] + question_sub_words_ids + [[tokenizer.eos_token_id]]
    context_sub_words_ids = context_sub_words_ids + [[tokenizer.eos_token_id]]

    input_ids = [j for i in question_sub_words_ids + context_sub_words_ids for j in i]
    if len(input_ids) > tokenizer.model_max_length:
        valid = False

    words_lengths = [len(item) for item in question_sub_words_ids + context_sub_words_ids]

    return {
        "input_ids": input_ids,
        "words_lengths": words_lengths,
        "valid": valid
    }


def data_collator(samples, tokenizer):
    if len(samples) == 0:
        return {}

    def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
        """Convert a list of 1d tensors into a padded 2d tensor."""
        size = max(v.size(0) for v in values)
        res = values[0].new(len(values), size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel()
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, v in enumerate(values):
            copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
        return res

    input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples],
                               pad_idx=tokenizer.pad_token_id)
    attention_mask = torch.zeros_like(input_ids)
    for i in range(len(samples)):
        attention_mask[i][:len(samples[i]['input_ids'])] = 1
    words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)

    batch_samples = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'words_lengths': words_lengths,
    }

    return batch_samples
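# --- Reconstructed orchestration -------------------------------------------
# The QA-model loading and the chatRoberta() definition were lost from this
# file, so what follows is a minimal sketch. The checkpoint name below and
# the 'responses' field looked up in the intents JSON are assumptions, not
# part of the original source; swap in the project's actual QA checkpoint.
qa_checkpoint = 'nguyenvulebinh/vi-mrc-base'  # assumed QA checkpoint
qa_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
qa_model = RobertaForQuestionAnswering.from_pretrained(qa_checkpoint)


def chatRoberta(text):
    # Classify the message into an intent tag with the PhoBert pipeline.
    # Fine-tuned checkpoints may emit generic 'LABEL_<id>' names, so map
    # those back through id2label.
    predicted = chatbot_pipeline(text)[0]['label']
    if predicted.startswith('LABEL_'):
        predicted = id2label[int(predicted[len('LABEL_'):])]

    # Gather that intent's responses as the QA context (assumes each intent
    # in the JSON carries a 'responses' list alongside 'patterns').
    context = ' '.join(
        response
        for intent in intents['intents'] if intent['tag'].strip() == predicted
        for response in intent.get('responses', [])
    )
    if not context:
        return "Sorry, I don't have an answer for that yet."

    # Extractive QA over the context. words_lengths is consumed only by
    # extract_answer() in this sketch; the vi-mrc reference models use a
    # custom MRC head that also takes it as a forward argument.
    sample = tokenize_function(text, context, qa_tokenizer)
    batch = data_collator([sample], qa_tokenizer)
    with torch.no_grad():
        outputs = qa_model(input_ids=batch['input_ids'],
                           attention_mask=batch['attention_mask'])
    answer = extract_answer([sample], outputs, qa_tokenizer)[0]['answer']
    return answer if answer.strip() else context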
def extract_answer(inputs, outputs, tokenizer):
    plain_result = []
    for sample_input, start_logit, end_logit in zip(inputs, outputs.start_logits, outputs.end_logits):
        sample_words_length = sample_input['words_lengths']
        input_ids = sample_input['input_ids']
        # Map the word-level argmax positions back to sub-word offsets.
        answer_start = sum(sample_words_length[:torch.argmax(start_logit)])
        answer_end = sum(sample_words_length[:torch.argmax(end_logit) + 1])

        if answer_start <= answer_end:
            answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
            if answer == tokenizer.bos_token:
                answer = ''
        else:
            answer = ''

        score_start = torch.max(torch.softmax(start_logit, dim=-1)).cpu().detach().numpy().tolist()
        score_end = torch.max(torch.softmax(end_logit, dim=-1)).cpu().detach().numpy().tolist()
        plain_result.append({
            "answer": answer,
            "score_start": score_start,
            "score_end": score_end
        })
    return plain_result


st.title("Chatbot Interface")
st.write("Hi! I am your virtual assistant. Feel free to ask, and I'll do my best to provide you with answers and assistance.")

text = st.text_input("User: ")
if st.button("Submit"):
    if text:
        result = chatRoberta(text)
        st.write(f"Chatbot: {result}")
    else:
        st.write("Please enter a message.")
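# Launch locally with the Streamlit CLI (assuming this script is saved as app.py):
#   streamlit run app.py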