File size: 6,785 Bytes
5109aa7
933dfa2
451f190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d727a1f
451f190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d727a1f
 
 
 
 
 
 
451f190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196


import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import nltk
from transformers.models.roberta.modeling_roberta import *
from transformers import RobertaForQuestionAnswering
from nltk import word_tokenize
import json
import pandas as pd
import re

# Download punkt for nltk
nltk.download('punkt')

# Load PhoBert model and tokenizer
phoBert_model = AutoModelForSequenceClassification.from_pretrained('minhdang14902/PhoBert_Edu')
phoBert_tokenizer = AutoTokenizer.from_pretrained('minhdang14902/PhoBert_Edu')
chatbot_pipeline = pipeline("sentiment-analysis", model=phoBert_model, tokenizer=phoBert_tokenizer)

# Load spaCy Vietnamese model
# nlp = spacy.load('vi_core_news_lg')

# Load intents from json file
def load_json_file(filename):
    with open(filename) as f:
        file = json.load(f)
    return file

filename = './data/QA_Legal_converted_merged.json'
intents = load_json_file(filename)

def create_df():
    df = pd.DataFrame({
        'Pattern': [],
        'Tag': []
    })
    return df

df = create_df()

def extract_json_info(json_file, df):
    for intent in json_file['intents']:
        for pattern in intent['patterns']:
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
    return df

df = extract_json_info(intents, df)
df2 = df.copy()

labels = df2['Tag'].unique().tolist()
labels = [s.strip() for s in labels]
num_labels = len(labels)
id2label = {id: label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

# def tokenize_with_spacy(text):
#     doc = nlp(text)
#     tokens = [token.text for token in doc]
#     tokenized_text = ' '.join(tokens)
#     tokenized_text = re.sub(r'(?<!\s)([.,?])', r' \1', tokenized_text)
#     tokenized_text = re.sub(r'([.,?])(?!\s)', r'\1 ', tokenized_text)
#     return tokenized_text

# Load Roberta model and tokenizer
roberta_model_checkpoint = "minhdang14902/Roberta_edu"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_checkpoint)
roberta_model = MRCQuestionAnswering.from_pretrained(roberta_model_checkpoint)

def chatRoberta(text):
    label = label2id[chatbot_pipeline(text)[0]['label']]
    response = intents['intents'][label]['responses']

    QA_input = {
        'question': text,
        'context': response[0]
    }

    # Tokenize input
    encoded_input = tokenize_function(QA_input, roberta_tokenizer)

    # Prepare batch samples
    batch_samples = data_collator([encoded_input], roberta_tokenizer)

    # Model prediction
    roberta_model.eval()
    with torch.no_grad():
        inputs = {
            'input_ids': batch_samples['input_ids'],
            'attention_mask': batch_samples['attention_mask'],
            'words_lengths': batch_samples['words_lengths'],
        }
        outputs = roberta_model(**inputs)

    # Extract answer
    result = extract_answer([encoded_input], outputs, roberta_tokenizer)
    return result

def tokenize_function(example, tokenizer):
    question_word = word_tokenize(example["question"])
    context_word = word_tokenize(example["context"])

    question_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in question_word]
    context_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in context_word]
    valid = True
    if len([j for i in question_sub_words_ids + context_sub_words_ids for j in i]) > tokenizer.model_max_length - 1:
        valid = False

    question_sub_words_ids = [[tokenizer.bos_token_id]] + question_sub_words_ids + [[tokenizer.eos_token_id]]
    context_sub_words_ids = context_sub_words_ids + [[tokenizer.eos_token_id]]

    input_ids = [j for i in question_sub_words_ids + context_sub_words_ids for j in i]
    if len(input_ids) > tokenizer.model_max_length:
        valid = False

    words_lengths = [len(item) for item in question_sub_words_ids + context_sub_words_ids]

    return {
        "input_ids": input_ids,
        "words_lengths": words_lengths,
        "valid": valid
    }

def data_collator(samples, tokenizer):
    if len(samples) == 0:
        return {}

    def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
        size = max(v.size(0) for v in values)
        res = values[0].new(len(values), size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel()
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, v in enumerate(values):
            copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
        return res

    input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples], pad_idx=tokenizer.pad_token_id)
    attention_mask = torch.zeros_like(input_ids)
    for i in range(len(samples)):
        attention_mask[i][:len(samples[i]['input_ids'])] = 1
    words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)

    batch_samples = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'words_lengths': words_lengths,
    }

    return batch_samples

def extract_answer(inputs, outputs, tokenizer):
    plain_result = []
    for sample_input, start_logit, end_logit in zip(inputs, outputs.start_logits, outputs.end_logits):
        sample_words_length = sample_input['words_lengths']
        input_ids = sample_input['input_ids']
        answer_start = sum(sample_words_length[:torch.argmax(start_logit)])
        answer_end = sum(sample_words_length[:torch.argmax(end_logit) + 1])

        if answer_start <= answer_end:
            answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
            if answer == tokenizer.bos_token:
                answer = ''
        else:
            answer = ''

        score_start = torch.max(torch.softmax(start_logit, dim=-1)).cpu().detach().numpy().tolist()
        score_end = torch.max(torch.softmax(end_logit, dim=-1)).cpu().detach().numpy().tolist()
        plain_result.append({
            "answer": answer,
            "score_start": score_start,
            "score_end": score_end
        })
    return plain_result

st.title("Chatbot Interface")
st.write("Hi! I am your virtual assistant. Feel free to ask, and I'll do my best to provide you with answers and assistance.")
text = st.text_input("User: ")

if st.button("Submit"):
    if text:
        result = chatRoberta(text)
        st.write(f"Chatbot: {result}")
    else:
        st.write("Please enter a message.")