minhdang14902 committed on
Commit 451f190 · verified · 1 Parent(s): 8c3f199

Update app.py

Files changed (1):
  1. app.py +194 -324
app.py CHANGED
@@ -1,324 +1,194 @@
- # import streamlit as st
- # import torch
-
- # # Title of the app
- # st.title('Hiển thị hình ảnh từ URL')
-
- # # Sample image URL
- # image_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
-
- # # Display the image
- # st.image(image_url, caption='Hình ảnh từ URL', use_column_width=True)
-
- import streamlit as st
- import torch
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
- import spacy
- import json
- import pandas as pd
- import re
- from transformers.models.roberta.modeling_roberta import *
-
- class MRCQuestionAnswering(RobertaPreTrainedModel):
-     config_class = RobertaConfig
-
-     def _reorder_cache(self, past, beam_idx):
-         pass
-
-     _keys_to_ignore_on_load_unexpected = [r"pooler"]
-     _keys_to_ignore_on_load_missing = [r"position_ids"]
-
-     def __init__(self, config):
-         super().__init__(config)
-         self.num_labels = config.num_labels
-
-         self.roberta = RobertaModel(config, add_pooling_layer=False)
-         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-         self.init_weights()
-
-     def forward(
-         self,
-         input_ids=None,
-         words_lengths=None,
-         start_idx=None,
-         end_idx=None,
-         attention_mask=None,
-         token_type_ids=None,
-         position_ids=None,
-         head_mask=None,
-         inputs_embeds=None,
-         start_positions=None,
-         end_positions=None,
-         span_answer_ids=None,
-         output_attentions=None,
-         output_hidden_states=None,
-         return_dict=None,
-     ):
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         outputs = self.roberta(
-             input_ids,
-             attention_mask=attention_mask,
-             token_type_ids=None,
-             position_ids=position_ids,
-             head_mask=head_mask,
-             inputs_embeds=inputs_embeds,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         sequence_output = outputs[0]
-
-         context_embedding = sequence_output
-
-         batch_size = input_ids.shape[0]
-         max_sub_word = input_ids.shape[1]
-         max_word = words_lengths.shape[1]
-         align_matrix = torch.zeros((batch_size, max_word, max_sub_word))
-
-         for i, sample_length in enumerate(words_lengths):
-             for j in range(len(sample_length)):
-                 start_idx = torch.sum(sample_length[:j])
-                 align_matrix[i][j][start_idx: start_idx + sample_length[j]] = 1 if sample_length[j] > 0 else 0
-
-         align_matrix = align_matrix.to(context_embedding.device)
-         context_embedding_align = torch.bmm(align_matrix, context_embedding)
-
-         logits = self.qa_outputs(context_embedding_align)
-         start_logits, end_logits = logits.split(1, dim=-1)
-         start_logits = start_logits.squeeze(-1).contiguous()
-         end_logits = end_logits.squeeze(-1).contiguous()
-
-         total_loss = None
-         if start_positions is not None and end_positions is not None:
-             if len(start_positions.size()) > 1:
-                 start_positions = start_positions.squeeze(-1)
-             if len(end_positions.size()) > 1:
-                 end_positions = end_positions.squeeze(-1)
-             ignored_index = start_logits.size(1)
-             start_positions = start_positions.clamp(0, ignored_index)
-             end_positions = end_positions.clamp(0, ignored_index)
-
-             loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-             start_loss = loss_fct(start_logits, start_positions)
-             end_loss = loss_fct(end_logits, end_positions)
-             total_loss = (start_loss + end_loss) / 2
-
-         if not return_dict:
-             output = (start_logits, end_logits) + outputs[2:]
-             return ((total_loss,) + output) if total_loss is not None else output
-
-         return QuestionAnsweringModelOutput(
-             loss=total_loss,
-             start_logits=start_logits,
-             end_logits=end_logits,
-             hidden_states=outputs.hidden_states,
-             attentions=outputs.attentions,
-         )
-
- from nltk import word_tokenize
- from transformers.models.auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
-
-
- def tokenize_function(example, tokenizer):
-     question_word = word_tokenize(example["question"])
-     context_word = word_tokenize(example["context"])
-
-     question_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in question_word]
-     context_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in context_word]
-     valid = True
-     if len([j for i in question_sub_words_ids + context_sub_words_ids for j in
-             i]) > tokenizer.model_max_length - 1:
-         valid = False
-
-     question_sub_words_ids = [[tokenizer.bos_token_id]] + question_sub_words_ids + [[tokenizer.eos_token_id]]
-     context_sub_words_ids = context_sub_words_ids + [[tokenizer.eos_token_id]]
-
-     input_ids = [j for i in question_sub_words_ids + context_sub_words_ids for j in i]
-     if len(input_ids) > tokenizer.model_max_length:
-         valid = False
-
-     words_lengths = [len(item) for item in question_sub_words_ids + context_sub_words_ids]
-
-     return {
-         "input_ids": input_ids,
-         "words_lengths": words_lengths,
-         "valid": valid
-     }
- def data_collator(samples, tokenizer):
-     if len(samples) == 0:
-         return {}
-
-     def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
-         size = max(v.size(0) for v in values)
-         res = values[0].new(len(values), size).fill_(pad_idx)
-
-         def copy_tensor(src, dst):
-             assert dst.numel() == src.numel()
-             if move_eos_to_beginning:
-                 assert src[-1] == eos_idx
-                 dst[0] = eos_idx
-                 dst[1:] = src[:-1]
-             else:
-                 dst.copy_(src)
-
-         for i, v in enumerate(values):
-             copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
-         return res
-
-     input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples], pad_idx=tokenizer.pad_token_id)
-     attention_mask = torch.zeros_like(input_ids)
-     for i in range(len(samples)):
-         attention_mask[i][:len(samples[i]['input_ids'])] = 1
-     words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)
-
-     batch_samples = {
-         'input_ids': input_ids,
-         'attention_mask': attention_mask,
-         'words_lengths': words_lengths,
-     }
-
-     return batch_samples
-
- def extract_answer(inputs, outputs, tokenizer):
-     plain_result = []
-     for sample_input, start_logit, end_logit in zip(inputs, outputs.start_logits, outputs.end_logits):
-         sample_words_length = sample_input['words_lengths']
-         input_ids = sample_input['input_ids']
-         answer_start = sum(sample_words_length[:torch.argmax(start_logit)])
-         answer_end = sum(sample_words_length[:torch.argmax(end_logit) + 1])
-
-         if answer_start <= answer_end:
-             answer = tokenizer.convert_tokens_to_string(
-                 tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-             if answer == tokenizer.bos_token:
-                 answer = ''
-         else:
-             answer = ''
-
-         score_start = torch.max(torch.softmax(start_logit, dim=-1)).cpu().detach().numpy().tolist()
-         score_end = torch.max(torch.softmax(end_logit, dim=-1)).cpu().detach().numpy().tolist()
-         plain_result.append({
-             "answer": answer,
-             "score_start": score_start,
-             "score_end": score_end
-         })
-     return plain_result
-
- # Load the Roberta extractive QA model
- model_checkpoint = "minhdang14902/Roberta_edu"
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
- model = MRCQuestionAnswering.from_pretrained(model_checkpoint)
-
- # Load the PhoBERT intent classifier
- from transformers import AutoModelForSequenceClassification
- model_sentiment = AutoModelForSequenceClassification.from_pretrained('minhdang14902/PhoBert_Edu')
- tokenizer_sentiment = AutoTokenizer.from_pretrained('minhdang14902/PhoBert_Edu')
- chatbot_sentiment = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_sentiment)
-
- import spacy
- import json
- # Initialize the Vietnamese spaCy model
- nlp = spacy.load('vi_core_news_lg')
- import pandas as pd
-
- def load_json_file(filename):
-     with open(filename) as f:
-         file = json.load(f)
-     return file
-
- filename = './data/QA_Legal_converted_merged.json'
- intents = load_json_file(filename)
-
- def create_df():
-     df = pd.DataFrame({
-         'Pattern' : [],
-         'Tag' : []
-     })
-     return df
-
- df = create_df()
-
- def extract_json_info(json_file, df):
-     for intent in json_file['intents']:
-         for pattern in intent['patterns']:
-             sentence_tag = [pattern, intent['tag']]
-             df.loc[len(df.index)] = sentence_tag
-     return df
-
- df = extract_json_info(intents, df)
- df2 = df.copy()
-
- labels = df2['Tag'].unique().tolist()
- labels = [s.strip() for s in labels]
- num_labels = len(labels)
- id2label = {i: label for i, label in enumerate(labels)}
- label2id = {v: k for k, v in id2label.items()}
-
- def preprocess(text, df):
-     def remove_numbers_and_special_chars(text):
-         text = re.sub(r'\d+', '', text)
-         text = re.sub(r'[^\w\s]', '', text)
-         text = re.sub(r'\s+', ' ', text).strip()
-         return text
-
-     text = text.lower()
-     text = remove_numbers_and_special_chars(text)
-     text_nlp = nlp(text)
-     filtered_sentence = [token.text for token in text_nlp if not token.is_stop]
-     text = ' '.join(filtered_sentence)
-
-     return text
-
- def predict(text):
-     new_text = preprocess(text, df2)
-     probs = chatbot_sentiment(new_text)
-     predicted_label = max(probs, key=lambda x: x['score'])['label']
-     return predicted_label
-
- # Build the user interface with Streamlit
- st.title("Vietnamese Legal Q&A Chatbot")
- st.write("Nhập câu hỏi của bạn về các vấn đề pháp lý:")
-
- user_question = st.text_input("Câu hỏi:")
-
- if st.button("Gửi câu hỏi"):
-     if user_question:
-         st.write("Câu hỏi của bạn:", user_question)
-
-         # Look for an answer in the intents dataset
-         found_intent = None
-         for intent in intents['intents']:
-             if user_question.lower() in [pattern.lower() for pattern in intent['patterns']]:
-                 found_intent = intent
-                 break
-
-         if found_intent:
-             answer = found_intent['responses'][0]
-             st.write("Câu trả lời:", answer)
-         else:
-             result = predict(user_question)
-             if result:
-                 st.write("Thẻ dự đoán:", result)
-
-             # Build the input for the QA model
-             qa_inputs = [{
-                 'context': found_intent['responses'][0] if found_intent else 'Tôi không có thông tin phù hợp.',
-                 'question': user_question
-             }]
-
-             qa_features = []
-             for qa_input in qa_inputs:
-                 feature = tokenize_function(qa_input, tokenizer)
-                 if feature["valid"]:
-                     qa_features.append(feature)
-
-             qa_batch = data_collator(qa_features, tokenizer)
-             with torch.no_grad():
-                 outputs = model(**qa_batch)
-
-             answers = extract_answer(qa_features, outputs, tokenizer)
-             best_answer = max(answers, key=lambda x: (x['score_start'] + x['score_end']) / 2)
-             st.write("Câu trả lời:", best_answer['answer'])
 
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ import nltk
+ from transformers.models.roberta.modeling_roberta import *
+ from transformers import RobertaForQuestionAnswering
+ from nltk import word_tokenize
+ import spacy
+ import json
+ import pandas as pd
+ import re
+
+ # Download punkt for nltk
+ nltk.download('punkt')
+
+ # Load PhoBert model and tokenizer
+ phoBert_model = AutoModelForSequenceClassification.from_pretrained('minhdang14902/PhoBert_Edu')
+ phoBert_tokenizer = AutoTokenizer.from_pretrained('minhdang14902/PhoBert_Edu')
+ chatbot_pipeline = pipeline("sentiment-analysis", model=phoBert_model, tokenizer=phoBert_tokenizer)
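+ # NOTE: the "sentiment-analysis" pipeline is used here as a generic text classifier;
+ # its predicted labels are the intent tags, which label2id below maps to intent indices.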
+
+ # Load spaCy Vietnamese model
+ nlp = spacy.load('vi_core_news_lg')
+
+ # Load intents from json file
+ def load_json_file(filename):
+     with open(filename) as f:
+         file = json.load(f)
+     return file
+
+ filename = './data/QA_Legal_converted_merged.json'
+ intents = load_json_file(filename)
+
+ def create_df():
+     df = pd.DataFrame({
+         'Pattern': [],
+         'Tag': []
+     })
+     return df
+
+ df = create_df()
+
+ def extract_json_info(json_file, df):
+     for intent in json_file['intents']:
+         for pattern in intent['patterns']:
+             sentence_tag = [pattern, intent['tag']]
+             df.loc[len(df.index)] = sentence_tag
+     return df
+
+ df = extract_json_info(intents, df)
+ df2 = df.copy()
+
+ labels = df2['Tag'].unique().tolist()
+ labels = [s.strip() for s in labels]
+ num_labels = len(labels)
+ id2label = {id: label for id, label in enumerate(labels)}
+ label2id = {label: id for id, label in enumerate(labels)}
+
+ def tokenize_with_spacy(text):
+     doc = nlp(text)
+     tokens = [token.text for token in doc]
+     tokenized_text = ' '.join(tokens)
+     tokenized_text = re.sub(r'(?<!\s)([.,?])', r' \1', tokenized_text)
+     tokenized_text = re.sub(r'([.,?])(?!\s)', r'\1 ', tokenized_text)
+     return tokenized_text
+
+ # Load Roberta model and tokenizer
+ roberta_model_checkpoint = "minhdang14902/Roberta_edu"
+ roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_checkpoint)
+ roberta_model = MRCQuestionAnswering.from_pretrained(roberta_model_checkpoint)
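+ # NOTE: MRCQuestionAnswering is the custom word-level QA head (a RobertaPreTrainedModel
+ # subclass whose forward accepts words_lengths); it is not defined or imported in this
+ # file, so it must be provided, e.g. as defined in the previous revision, for this call to work.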
+
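+ # chatRoberta: classify the question into an intent with the PhoBERT pipeline, use that
+ # intent's stored response as the QA context, then extract a span answer with Roberta.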
+ def chatRoberta(text):
+     label = label2id[chatbot_pipeline(text)[0]['label']]
+     response = intents['intents'][label]['responses']
+
+     QA_input = {
+         'question': text,
+         'context': response[0]
+     }
+
+     # Tokenize input
+     encoded_input = tokenize_function(QA_input, roberta_tokenizer)
+
+     # Prepare batch samples
+     batch_samples = data_collator([encoded_input], roberta_tokenizer)
+
+     # Model prediction
+     roberta_model.eval()
+     with torch.no_grad():
+         inputs = {
+             'input_ids': batch_samples['input_ids'],
+             'attention_mask': batch_samples['attention_mask'],
+             'words_lengths': batch_samples['words_lengths'],
+         }
+         outputs = roberta_model(**inputs)
+
+     # Extract answer
+     result = extract_answer([encoded_input], outputs, roberta_tokenizer)
+     return result
+
+ def tokenize_function(example, tokenizer):
+     question_word = word_tokenize(example["question"])
+     context_word = word_tokenize(example["context"])
+
+     question_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in question_word]
+     context_sub_words_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in context_word]
+     valid = True
+     if len([j for i in question_sub_words_ids + context_sub_words_ids for j in i]) > tokenizer.model_max_length - 1:
+         valid = False
+
+     question_sub_words_ids = [[tokenizer.bos_token_id]] + question_sub_words_ids + [[tokenizer.eos_token_id]]
+     context_sub_words_ids = context_sub_words_ids + [[tokenizer.eos_token_id]]
+
+     input_ids = [j for i in question_sub_words_ids + context_sub_words_ids for j in i]
+     if len(input_ids) > tokenizer.model_max_length:
+         valid = False
+
+     words_lengths = [len(item) for item in question_sub_words_ids + context_sub_words_ids]
+
+     return {
+         "input_ids": input_ids,
+         "words_lengths": words_lengths,
+         "valid": valid
+     }
+
+ def data_collator(samples, tokenizer):
+     if len(samples) == 0:
+         return {}
+
+     def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
+         size = max(v.size(0) for v in values)
+         res = values[0].new(len(values), size).fill_(pad_idx)
+
+         def copy_tensor(src, dst):
+             assert dst.numel() == src.numel()
+             if move_eos_to_beginning:
+                 assert src[-1] == eos_idx
+                 dst[0] = eos_idx
+                 dst[1:] = src[:-1]
+             else:
+                 dst.copy_(src)
+
+         for i, v in enumerate(values):
+             copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
+         return res
+
+     input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples], pad_idx=tokenizer.pad_token_id)
+     attention_mask = torch.zeros_like(input_ids)
+     for i in range(len(samples)):
+         attention_mask[i][:len(samples[i]['input_ids'])] = 1
+     words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)
+
+     batch_samples = {
+         'input_ids': input_ids,
+         'attention_mask': attention_mask,
+         'words_lengths': words_lengths,
+     }
+
+     return batch_samples
+
+ def extract_answer(inputs, outputs, tokenizer):
+     plain_result = []
+     for sample_input, start_logit, end_logit in zip(inputs, outputs.start_logits, outputs.end_logits):
+         sample_words_length = sample_input['words_lengths']
+         input_ids = sample_input['input_ids']
+         answer_start = sum(sample_words_length[:torch.argmax(start_logit)])
+         answer_end = sum(sample_words_length[:torch.argmax(end_logit) + 1])
+
+         if answer_start <= answer_end:
+             answer = tokenizer.convert_tokens_to_string(
+                 tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+             if answer == tokenizer.bos_token:
+                 answer = ''
+         else:
+             answer = ''
+
+         score_start = torch.max(torch.softmax(start_logit, dim=-1)).cpu().detach().numpy().tolist()
+         score_end = torch.max(torch.softmax(end_logit, dim=-1)).cpu().detach().numpy().tolist()
+         plain_result.append({
+             "answer": answer,
+             "score_start": score_start,
+             "score_end": score_end
+         })
+     return plain_result
+
+ st.title("Chatbot Interface")
+ st.write("Hi! I am your virtual assistant. Feel free to ask, and I'll do my best to provide you with answers and assistance.")
+ text = st.text_input("User: ")
+
+ if st.button("Submit"):
+     if text:
+         result = chatRoberta(text)
+         st.write(f"Chatbot: {result}")
+     else:
+         st.write("Please enter a message.")