fmyaidha committed on
Commit 6b94ed1 · verified · 1 Parent(s): 2d3164d

Upload 18 files

.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
gradio/output_files/semantic_chunk.index filter=lfs diff=lfs merge=lfs -text
+ output_files/semantic_chunk.index filter=lfs diff=lfs merge=lfs -text
__pycache__/config.cpython-310.pyc ADDED
Binary file (942 Bytes).
 
__pycache__/config.cpython-312.pyc ADDED
Binary file (989 Bytes).
 
__pycache__/english_search.cpython-310.pyc ADDED
Binary file (2.68 kB).
 
__pycache__/retrievor.cpython-310.pyc ADDED
Binary file (11.2 kB).
 
__pycache__/text2vec.cpython-310.pyc ADDED
Binary file (4.93 kB).
 
config.py ADDED
@@ -0,0 +1,34 @@
+ class Config():
+
+     # retrievor parameters
+     topd = 3              # number of documents to recall
+     topt = 6              # number of text chunks to recall
+     maxlen = 128          # length of each recalled text chunk
+     topk = 5              # number of keywords recalled from the query
+     bert_path = '/workspace/model/embedding/tao-8k'
+     recall_way = 'embed'  # recall method: keyword or embed
+
+     # Generator parameters
+     max_source_length = 767   # maximum input length
+     max_target_length = 256   # maximum generation length
+     model_max_length = 1024   # maximum sequence length
+
+     # Embedding API parameters - used by text2vec.py
+     use_api = True        # use the API instead of a local model
+     api_key = "sk-1c7a2cd7244e4fc7b65f2c1f4c2b949c"
+     base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+     model_name = "text-embedding-v3"
+     dimensions = 1024
+     batch_size = 10
+
+     # LLM API parameters - used by rag.py
+     llm_api_key = "sk-1c7a2cd7244e4fc7b65f2c1f4c2b949c"  # shares the same key as the embedding API
+     llm_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"  # shares the same URL as the embedding API
+     llm_model = "qwen-plus"  # default LLM model
+
+     # Knowledge base configuration
+     kb_base_dir = "knowledge_bases"  # root directory for knowledge bases
+     default_kb = "default"           # default knowledge base name
+
+     # Output directory configuration - now used as a temporary-file directory
+     output_dir = "output_files"
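For orientation, here is a minimal sketch of how `text2vec.py` might consume the embedding settings above through the OpenAI-compatible DashScope endpoint when `use_api` is enabled. The `embed_texts` helper is an illustrative assumption, not code from this commit:

```python
from openai import OpenAI

from config import Config

cfg = Config()
# OpenAI-compatible client pointed at the DashScope endpoint from Config.
client = OpenAI(api_key=cfg.api_key, base_url=cfg.base_url)

def embed_texts(texts):
    """Embed a list of strings in batches of Config.batch_size (hypothetical helper)."""
    vectors = []
    for i in range(0, len(texts), cfg.batch_size):
        batch = texts[i:i + cfg.batch_size]
        resp = client.embeddings.create(
            model=cfg.model_name,        # "text-embedding-v3"
            input=batch,
            dimensions=cfg.dimensions,   # 1024-dimensional vectors
        )
        vectors.extend(item.embedding for item in resp.data)
    return vectors
```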
knowledge_base.txt ADDED
File without changes
output/knowledge_base.txt ADDED
@@ -0,0 +1,3 @@
+ # Knowledge Base
+
+ This file will be populated when you upload documents.
output_files/knowledge_base.txt ADDED
@@ -0,0 +1,595 @@
+ ----- File: 0cb37d0c-2bae-47f9-bcd4-54636d9f15e4.pdf -----
+
+ Empowering Few-Shot Relation Extraction with the Integration of Traditional RE Methods and Large Language Models
+ Ye Liu, Kai Zhang(B), Aoran Gan, Linan Yue, Feng Hu, Qi Liu, Enhong Chen
+ 1 School of Data Science, School of Computer Science and Technology, University of Science and Technology of China
+ 2 State Key Laboratory of Cognitive Intelligence
+ {liuyer,gar,lnyue,fenghufh3}@mail.ustc.edu.cn
+ {kkzhang08,qiliuql,cheneh}@ustc.edu.cn
+ arXiv:2407.08967v1 [cs.CL] 12 Jul 2024
+
+ Abstract. Few-Shot Relation Extraction (FSRE), a subtask of Relation Extraction (RE) that utilizes limited training instances, appeals to more researchers in Natural Language Processing (NLP) due to its capability to extract textual information in extremely low-resource scenarios. The primary methodologies employed for FSRE have been fine-tuning or prompt tuning techniques based on Pre-trained Language Models (PLMs). Recently, the emergence of Large Language Models (LLMs) has prompted numerous researchers to explore FSRE through In-Context Learning (ICL). However, there are substantial limitations associated with methods based on either traditional RE models or LLMs. Traditional RE models are hampered by a lack of necessary prior knowledge, while LLMs fall short in their task-specific capabilities for RE. To address these shortcomings, we propose a Dual-System Augmented Relation Extractor (DSARE), which synergistically combines traditional RE models with LLMs. Specifically, DSARE innovatively injects the prior knowledge of LLMs into traditional RE models, and conversely enhances LLMs' task-specific aptitude for RE through relation extraction augmentation. Moreover, an Integrated Prediction module is employed to jointly consider these two respective predictions and derive the final results. Extensive experiments demonstrate the efficacy of our proposed method.
+ Keywords: Relation Extraction · Few Shot · Large Language Models.
+ 1 Introduction
+
+ Relation Extraction (RE) aims to determine the relation expressed between two entities within an unstructured textual context [23]. Few-Shot Relation Extraction (FSRE), as a subtask of RE, seeks to solve the RE problem by utilizing only K instances per relation (K-shot) in the training and validation phases [3,20]. The primary methodologies employed to address the FSRE task have been fine-tuning or prompt tuning techniques grounded on Pre-trained Language Models (PLMs) [3,23]. Recently, with the emergence of Large Language Models (LLMs), numerous researchers have embarked on the exploration of FSRE through the In-Context Learning (ICL) technology [5,19,20]. However, there are substantial limitations associated with methods based on either traditional RE models or LLMs. As depicted in Figure 1, although most traditional RE methods are custom-built for the RE task, they still lack necessary prior knowledge that is crucial for resolving many domain-specific cases [3,6]. Acquiring such prior knowledge is particularly challenging in extremely low-resource settings, such as an 8-shot scenario. On the other hand, methods based on LLMs present a contrasting issue. With the scaling of model size and corpus size, LLMs possess an extraordinary amount of prior knowledge. Nevertheless, given that these LLMs are designed for general usage, they lack the task-specific ability for RE, which makes it difficult to fully harness their prior knowledge. This dichotomy between the strengths and weaknesses of traditional RE models and LLMs presents a novel perspective in the field of few-shot relation extraction.
+
+ [Figure 1: two panels contrast a traditional RE method (specially designed for the RE task, but lacking prior knowledge because the few-shot data are too scarce; typed markers fed into PLMs) with an LLM-based RE method (rich prior knowledge, but unable to understand the RE task well, e.g., replying "Sorry, I don't understand your question. Do you mean ...?" to a relation query about "National Action Network" and "Rev").]
+ Fig. 1. The comparison between traditional RE methods and LLM-based RE methods.
+
+ To this end, this paper proposes a novel approach that amalgamates the traditional RE methods with LLMs. By doing so, we aim to address the aforementioned shortcomings by capitalizing on their respective strengths. Specifically, we develop a Dual-System Augmented Relation Extractor (DSARE) for few-shot relation extraction. DSARE consists of three key components: (a) An LLM-augmented RE module: This module designs prompts that enable LLMs to generate additional in-domain labeled data to boost the training of traditional RE models, thereby effectively injecting the prior knowledge of LLMs into the traditional RE methods. (b) An RE-augmented LLM module: This module utilizes the trained RE model to identify and retrieve the most valuable samples from the training data. These samples are subsequently employed as demonstrations for the In-Context Learning of LLMs, thereby enhancing their RE-specific aptitude. (c) An Integrated Prediction module: It takes into account the predictions of both the LLM-augmented RE and RE-augmented LLM modules. When the two predictions differ, a specially designed selector is activated to make a final decision. Finally, extensive experiments on three publicly available datasets demonstrate the effectiveness of our proposed method, and further indicate the necessity to integrate traditional RE models and LLMs.
+
+ Our code is available via https://github.com/liuyeah/DSARE.
+ 2 Related Work
+
+ Few-shot Relation Extraction. Due to the large computation ability of pre-trained language models, existing few-shot relation extraction methods mainly adopt the fine-tuning method to solve the few-shot relation extraction problem [13,23]. In recent years, in order to bridge the gap between pre-training objectives and the RE task, prompt tuning has been proposed and has demonstrated remarkable capability in low-resource scenarios [3,6,7]. Currently, with the rise of large language models, many researchers attempt to tackle few-shot relation extraction via In-Context Learning technology [5,19,20]. However, these approaches simply apply LLMs to few-shot relation extraction tasks through straightforward queries, which fails to fully harness the potential of LLMs. More importantly, they overlook the possibility that LLMs and traditional RE models could mutually enhance each other's performance.
+
+ Large Language Models. The emergence of Large Language Models (LLMs) such as GPT-4, LLama-2 and others [14,15,17,18] represents a significant advancement in the field of natural language processing. By leveraging In-Context Learning, a novel few-shot learning paradigm was first introduced by [2]. Up to now, LLMs have demonstrated remarkable performance across a range of NLP tasks, such as text classification, named entity recognition, question answering and relation extraction [5,8,19,20]. Previous research efforts [5,19,20] have sought to solve few-shot relation extraction by directly asking LLMs or retrieving more suitable demonstrations. For instance, Wan et al. [19] attempted to introduce label-induced reasoning logic to enrich the demonstrations. Meanwhile, Xu et al. [20] designed task-related instructions and a schema-constrained data generation strategy, which could boost previous RE methods to obtain state-of-the-art few-shot results.
+ 3 Problem Statement
+
+ Let C denote the input text and e_sub ∈ C, e_obj ∈ C denote the pair of subject and object entities. Given the entity type of e_sub, e_obj, and a set of pre-defined relation classes R, relation extraction aims to predict the relation y ∈ R between the pair of entities (e_sub, e_obj) within the context C [19,23]. As for the few-shot settings, following the strategy adopted by [4,20], we randomly sample K instances per relation (K-shot) for the training and validation phases. The whole test set is preserved to ensure the effectiveness of evaluation.
+ 4 DSARE Model
+
+ 4.1 LLM-augmented RE
+
+ LLM Data Augmentation. In this part, we aim to implement the data augmentation via LLMs, anticipated to enrich the training data for relation extraction. Specifically, drawing inspiration from [20], we construct prompts to tell the LLM the essential components of one RE training sample, i.e., context text, subject entity, object entity, subject entity type, object entity type and the relation. Then the LLM is guided to create more pseudo RE samples. Upon receiving the outputs from the LLM, we establish rules, such as regular expressions, to transform the augmented RE data into the desired format.
+
+ [Figure 2: an example input (Document: "Speaking to a meeting of the National Action Network, a civil rights organization founded by the Rev."; Subject Entity: National Action Network, type Organization; Object Entity: Rev, type Person) flows through (a) the LLM-augmented RE module, consisting of (a.1) LLM Data Augmentation (prompt: "One sample consists of a relation, a context, a pair of head and tail entities ... Generate more samples like above for the relation: Relation: org:founded_by; Context: Steve Jobs is the co-founder of Apple Inc. ...") and (a.2) a Traditional RE Model (typed entity marker (punct) → PLM → classification layer); (b) the RE-augmented LLM module, consisting of (b.1) KNN Demonstration and (b.2) LLM Inference (prompt: "Given a context, a pair of head and tail entities in the context, decide the relationship between ... The relation between xxx and xxx is"); and (c) the Integrated Prediction module, whose selector is invoked when the two predictions differ.]
+ Fig. 2. The architecture of DSARE. It includes three parts: (a) LLM-augmented RE module; (b) RE-augmented LLM module; (c) Integrated Prediction module.
+
+ Traditional RE Model. With the augmented datasets, we obtain more diverse data to train a traditional RE model. Here we adopt the Typed Entity Marker (punct) method proposed by [23] to denote the entity and context text, and further train a relation extraction model. Specifically, we utilize the symbols "@" and "#" to denote the start/end of the subject and object entities, and further adopt the symbols "*" and "∧" to indicate the subject and object entity types, respectively. The processed text is then fed into the pre-trained language model to obtain the representations of the subject and object entities (h_sub, h_obj) via the special tokens "@" and "#". Finally, we pass (h_sub, h_obj) into a classification layer to derive the results.
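To make the input format concrete, below is a rough sketch of the Typed Entity Marker (punct) preprocessing described above, with "^" standing in for the "∧" symbol; the helper function and example sentence are illustrative assumptions rather than the authors' code:

```python
def add_typed_markers(tokens, subj_span, subj_type, obj_span, obj_type):
    """Wrap the subject in "@ * type * ... @" and the object in "# ^ type ^ ... #"."""
    subj_start, subj_end = subj_span  # inclusive token indices of the subject entity
    obj_start, obj_end = obj_span     # inclusive token indices of the object entity
    marked = []
    for i, tok in enumerate(tokens):
        if i == subj_start:
            marked += ["@", "*", subj_type, "*"]
        if i == obj_start:
            marked += ["#", "^", obj_type, "^"]
        marked.append(tok)
        if i == subj_end:
            marked.append("@")
        if i == obj_end:
            marked.append("#")
    return " ".join(marked)

tokens = "Steve Jobs is the co-founder of Apple Inc .".split()
print(add_typed_markers(tokens, (6, 7), "organization", (0, 1), "person"))
# -> # ^ person ^ Steve Jobs # is the co-founder of @ * organization * Apple Inc @ .
```

The encoder's hidden states at the leading "@" and "#" markers then serve as h_sub and h_obj for the classification layer.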
+ 4.2 RE-augmented LLM
+
+ KNN Demonstration. In Section 4.1, we train a traditional relation extraction model, which allows us to implement a k-nearest neighbors (KNN) search method to retrieve more valuable samples from the training set. Specifically, we utilize the obtained entity representation H = [h_sub, h_obj] to represent each sample, and further obtain the representation and label pair (H_i, r_i) on the training set, which we denote as a datastore D. When inferring a new sample j, we utilize its entity representation H_j to query D according to the Euclidean distance to obtain the k nearest neighbors N = {(H_i, r_i)}_{i=1}^{k}, which we adopt as demonstrations for LLM inference.
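Operationally, this retrieval step is a plain Euclidean-distance k-nearest-neighbor search over the stored entity-pair representations; a minimal sketch (the function name and array layout are assumptions):

```python
import numpy as np

def knn_demonstrations(H_query, datastore_H, datastore_labels, k=8):
    """Return the k nearest (representation, relation) pairs from the datastore D.

    H_query          -- (d,) concatenated entity representation [h_sub; h_obj] of the new sample
    datastore_H      -- (n, d) representations of the K-shot training samples
    datastore_labels -- length-n list of their relation labels r_i
    """
    distances = np.linalg.norm(datastore_H - H_query, axis=1)  # Euclidean distance to every stored sample
    nearest = np.argsort(distances)[:k]                        # indices of the k closest samples
    return [(datastore_H[i], datastore_labels[i]) for i in nearest]
```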
+ Table 1. Data Statistics
+ | Dataset   | #Train  | #Dev    | #Test  | #Rel |
+ | TACRED    | 8/16/32 | 8/16/32 | 15,509 | 42   |
+ | TACREV    | 8/16/32 | 8/16/32 | 15,509 | 42   |
+ | Re-TACRED | 8/16/32 | 8/16/32 | 13,418 | 40   |
+ LLM Inference. After obtaining the effective demonstrations, we design prompts to provide the essential information to the LLM, thus generating the LLM results. Specifically, inspired by the various attempts about ICL [20], we first describe the target of the relation extraction task through an instruction. Then, the retrieved k nearest neighbors N = {(H_i, r_i)}_{i=1}^{k} of the current sample are followed, which provide the most relevant information to the LLM. Finally, we ask the LLM to predict the relation of the current sample.
+ 4.3 Integrated Prediction
+
+ In Sections 4.1 and 4.2, we apply traditional RE models and LLMs to conduct few-shot relation extraction from dual perspectives. In this part, we aim to obtain the final outputs by considering both the LLM-augmented RE inference result P_re and the RE-augmented LLM inference result P_llm. More specifically, as illustrated in Figure 2, if the two results are equal (i.e., P_re = P_llm), our model directly yields the predicted relation. In circumstances where the two results diverge, we design a selector to further ask the LLM to make a choice between these two relations. In order to improve the effectiveness of the selector, we directly retrieve m samples labeled with each of these two relations from the training dataset, respectively. Subsequently, we ask the LLM in a similar way to the one introduced in LLM Inference to obtain the final results.¹
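The decision rule of this module can be summarized in a few lines; the sketch below assumes a hypothetical `ask_llm_selector` that prompts the LLM with the m retrieved samples per candidate relation:

```python
def integrated_prediction(p_re, p_llm, ask_llm_selector):
    """Combine the two predictions: agreement is accepted directly,
    disagreement is resolved by the LLM-based selector."""
    if p_re == p_llm:
        return p_re
    # Selector: the LLM chooses between the two candidate relations,
    # given m labeled training samples for each of them as demonstrations.
    return ask_llm_selector(candidates=(p_re, p_llm))
```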
+ 5 Experiments
+
+ 5.1 Experimental Setup
+
+ Datasets and Evaluation Metrics. For extensive experiments, we conduct our experiments on three widely-used relation extraction datasets: TACRED [22], TACREV [1] and Re-TACRED [16]. More statistics about the datasets can be found in Table 1. Regarding the evaluation metrics, we adopt the micro-F1 scores of RE as the primary metric to evaluate models, considering that F1 scores can assess the overall performance of precision and recall [3,9,10,21].
+
+ ¹ If the LLM does not make an inference or we are unable to convert the output into the pre-defined relations, we will conclude there is no relation between subject and object entities. Note that no_relation is also a relation category in these datasets.
+ Implementation Details. In this paper, we utilize the zephyr-7b-alpha [18] model on Huggingface as the LLM to conduct experiments. In the Traditional RE Model part (Section 4.1), we adopt roberta-large [11] as the base architecture. The batch size is set to 4, and the model is optimized by AdamW [12] with a learning rate of 3e-5. We train the model on the training set for 50 epochs and choose the best epoch based on the micro-F1 performance on the development set. In the LLM Data Augmentation part (Section 4.1), we double the K-shot training set through LLMs. That is to say, for an 8-shot training set, we construct 8 pieces of pseudo data per relation, thereby creating the final augmented training set. In the KNN Demonstration part (Section 4.2), we set the number of retrieved nearest neighbors as k = 8. In the Integrated Prediction module (Section 4.3), we set the number of retrieved samples for each relation as m = 4.
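As a rough illustration of the setup described above (not the authors' released code), the traditional RE model could be assembled along these lines:

```python
import torch
from transformers import AutoModel

class REClassifier(torch.nn.Module):
    """roberta-large encoder with a classification head over [h_sub; h_obj]."""

    def __init__(self, num_relations, encoder_name="roberta-large"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(encoder_name)
        hidden = self.encoder.config.hidden_size
        self.classifier = torch.nn.Linear(2 * hidden, num_relations)

    def forward(self, input_ids, attention_mask, subj_marker_pos, obj_marker_pos):
        states = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        rows = torch.arange(input_ids.size(0))
        h_sub = states[rows, subj_marker_pos]  # hidden state at the leading "@" marker
        h_obj = states[rows, obj_marker_pos]   # hidden state at the leading "#" marker
        return self.classifier(torch.cat([h_sub, h_obj], dim=-1))

model = REClassifier(num_relations=40)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)  # batch size 4, 50 epochs in the paper
```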
+ Benchmark Methods. We compare our method with state-of-the-art few-shot relation extraction methods. According to the modeling architecture, they can be grouped into three categories, including Traditional RE Methods (① ~ ④), LLM-based Methods (⑤ ~ ⑦) and Hybrid Methods (⑧).
+ – ① TYP Marker [23] proposes to incorporate entity representations with typed markers, which presents remarkable performance on the RE task.
+ – ② PTR [7] designs prompt tuning with rules for relation extraction tasks and applies logic rules to construct prompts with several sub-prompts.
+ – ③ KnowPrompt [3] innovatively injects the latent knowledge contained in relation labels into prompt construction with the learnable virtual type words and answer words.
+ – ④ GenPT [6] proposes a novel generative prompt tuning method to reformulate relation classification as an infilling problem, which exploits rich semantics of entity and relation types.
+ – ⑤ GPT-3.5 [15], ⑥ LLama-2 [17] and ⑦ Zephyr [18] are advanced LLMs. We leverage the API for GPT-3.5, while adopting the 7B version for LLama-2 (llama-2-7b-chat-hf) and Zephyr (zephyr-7b-alpha). We utilize the prompt from [20] to conduct In-Context Learning.
+ – ⑧ Unleash [20] proposes a schema-constrained data generation method² through LLMs, which boosts previous RE methods (i.e., KnowPrompt) to obtain more competitive results.
+ It is worth noting that, for the LLM-based baselines (⑤ ~ ⑦), due to the limitations of maximum tokens and the fact that these datasets have at least 40 relations, we utilize one-shot demonstration per relation following the strategy proposed by [20]. In contrast, our DSARE method, as introduced in the Implementation Details part, requires a maximum of 16 demonstrations³, which is much fewer than the one-shot-demonstration-per-relation setting (>= 40), thus avoiding unfair comparison.
+
+ ² For fair comparison, we apply this data generation method to double the training set, which is the same as our settings introduced in the Implementation Details part.
+ ³ In the KNN Demonstration part (Section 4.2), the number of retrieved nearest neighbors is k = 8. And in the Integrated Prediction module (Section 4.3), we need a maximum of 2m = 8 additional demonstrations.
+ Table 2. Experimental Results (%). Each dataset has K = 8 / 16 / 32 columns; the LLM-based methods ⑤–⑦ report a single value per dataset.
+ | Methods       | TACRED K=8 | K=16  | K=32  | TACREV K=8 | K=16  | K=32  | Re-TACRED K=8 | K=16  | K=32  |
+ | ① TYP Marker  | 29.02      | 31.35 | 31.86 | 26.28      | 29.24 | 31.55 | 51.32         | 55.60 | 57.82 |
+ | ② PTR         | 28.34      | 29.39 | 30.45 | 28.63      | 29.75 | 30.79 | 47.80         | 53.83 | 60.99 |
+ | ③ KnowPrompt  | 30.30      | 33.53 | 34.42 | 30.47      | 33.54 | 33.86 | 56.74         | 61.90 | 65.92 |
+ | ④ GenPT       | 35.45      | 35.58 | 35.61 | 33.81      | 33.93 | 36.72 | 57.03         | 57.66 | 65.25 |
+ | ⑤ GPT-3.5     | 29.72      |       |       | 29.98      |       |       | 39.06         |       |       |
+ | ⑥ LLama-2     | 22.68      |       |       | 21.96      |       |       | 34.31         |       |       |
+ | ⑦ Zephyr      | 37.10      |       |       | 38.83      |       |       | 35.81         |       |       |
+ | ⑧ Unleash     | 32.24      | 33.81 | 34.76 | 32.70      | 34.53 | 35.28 | 58.29         | 64.37 | 66.03 |
+ | DSARE (ours)  | 43.84      | 45.40 | 45.94 | 44.69      | 46.61 | 46.94 | 60.04         | 66.83 | 67.13 |
+ 5.2 Experimental Results
+
+ The main results are illustrated in Table 2. Our proposed DSARE model outperforms all baselines across all metrics. Particularly on the TACRED and TACREV datasets, our method manifests a significant advantage. This demonstrates the effectiveness of our designs and the benefits of integrating traditional RE models and LLMs. Furthermore, there are also some interesting phenomena:
+ First, the vast majority of methods exhibit superior performance on the Re-TACRED dataset compared to the TACRED and TACREV datasets. This is reasonable, as Re-TACRED is an improved version among these three datasets, which addresses some shortcomings of the original TACRED dataset and refactors its training, development and test sets. The more precise labels contribute to the learning process of these models, thereby yielding superior performance. Second, among the LLM-based methods, Zephyr (7B) demonstrates competitive performance and significantly outperforms GPT-3.5 and LLama-2 on the TACRED and TACREV datasets. This proves its strong information extraction ability, as claimed in [18]. Third, Unleash introduces a schema-constrained data augmentation method through LLMs to enhance the KnowPrompt baseline. It achieves a certain degree of improvement compared to KnowPrompt, verifying the feasibility of this line of thinking. And our DSARE model significantly surpasses Unleash, which further demonstrates the effectiveness of our designs from another perspective.
+ 5.3 Ablation Study
+
+ In this subsection, we carry out ablation experiments to validate the effectiveness of the various components of the DSARE model. Specifically, we first remove the Integrated Prediction module, consequently leading to two ablated variants: LLM-augmented RE and RE-augmented LLM. As shown in Table 3, there are obvious decreases between DSARE and its two variants, demonstrating the efficacy of the Integrated Prediction module.
+
+ Table 3. Ablation Experiments (%). Pure LLM reports a single value.
+ | Ablation Models   | Re-TACRED K=8 | K=16  | K=32  |
+ | DSARE             | 60.04         | 66.83 | 67.13 |
+ | LLM-augmented RE  | 52.53         | 58.01 | 58.56 |
+ | RE-augmented LLM  | 56.38         | 64.85 | 66.03 |
+ | Pure RE           | 51.32         | 55.60 | 57.82 |
+ | Pure LLM          | 35.81         |       |       |
434
+ (a)
435
+ Document:
436
+ Subject Entity:
437
+ Ground Truth Relation:
438
+       The Huntington Library, founded in 1919 by Henry 
439
+ Huntington, is one of the world 's greatest cultural, research 
440
+ and educational centers.
441
+ Huntington Library
442
+ Object Entity:
443
+ Subject Entity Type:
444
+ Object Entity Type:
445
+ Henry Huntington
446
+ Organization
447
+ Person
448
+ LLM-augmented RE Prediction:
449
+ RE-augmented LLM Prediction:
450
+ DSARE Prediction:
451
+ Document:
452
+ Subject Entity:
453
+ Ground Truth Relation:
454
+       Piedra testified he struggled to get his career going after 
455
+ graduating in 1998 from Tufts University School of 
456
+ Dental Medicine.
457
+ He
458
+ Object Entity:
459
+ Subject Entity Type:
460
+ Object Entity Type:
461
+ His
462
+ Person
463
+ Person
464
+ LLM-augmented RE Prediction:
465
+ RE-augmented LLM Prediction:
466
+ DSARE Prediction:
467
+ Document:
468
+ Subject Entity:
469
+ Ground Truth Relation:
470
+       "Our dad passed away when Emily was 17 and I was 18, '' 
471
+ says Sarah Kunstler, 33, who is also an attorney.
472
+ Sarah Kunstler
473
+ Object Entity:
474
+ Subject Entity Type:
475
+ Object Entity Type:
476
+ Emily
477
+ Person
478
+ Person
479
+ LLM-augmented RE Prediction:
480
+ RE-augmented LLM Prediction:
481
+ DSARE Prediction:
482
+ (b)
483
+ (c)
484
+ Fig. 3. The case study of the DSARE model. (a) is from the TACRED dataset (K=8),
485
+ while (b) and (c) are from the Re-TACRED dataset (K=8).
+ Subsequently, we further remove the LLM Data Augmentation part from LLM-augmented RE and the KNN Demonstration part from RE-augmented LLM. This yields two other variants, i.e., Pure RE and Pure LLM⁴. From the results, both of these variants perform inferiorly, especially Pure LLM. These findings further demonstrate the validity and non-redundancy of our designs.
+
+ ⁴ Note that here Pure LLM is equivalent to the baseline ⑦ Zephyr.
+ 5.4 Case Study
+
+ In this section, we conduct a case study to more intuitively illustrate the effectiveness of integrating traditional RE models and LLMs. Specifically, as illustrated in Figure 3, we present the input information (i.e., document, subject/object entity, subject/object entity type), the ground-truth relation and the predictions of DSARE and its ablated variants, respectively. In Figure 3 (a), both the LLM-augmented RE and the RE-augmented LLM make the correct prediction. In Figure 3 (b) and (c), the LLM-augmented RE and RE-augmented LLM correctly infer the relations (per:identity and per:siblings), respectively. And with the aid of the Integrated Prediction module, DSARE finally derives the correct predictions. These cases intuitively demonstrate the significant role of integrating traditional RE methods and LLMs, and further verify the validity of our DSARE model.
+ 6 Conclusions
+
+ In this paper, we explored a motivated direction for empowering few-shot relation extraction with the integration of traditional RE models and LLMs. We first analyzed the necessity of jointly utilizing traditional RE models and LLMs, and further proposed a Dual-System Augmented Relation Extractor (DSARE). Specifically, we designed an LLM-augmented RE module, which could inject the prior knowledge of LLMs into the traditional RE models. Subsequently, an RE-augmented LLM module was proposed to identify and retrieve the most valuable samples from the training data, which provided more useful demonstrations for the In-Context Learning of LLMs. More importantly, we designed an Integrated Prediction module to jointly consider the predictions of both the LLM-augmented RE and RE-augmented LLM modules, thus taking advantage of each other's strengths and deriving the final results. Finally, extensive experiments on three publicly available datasets demonstrated the effectiveness of our proposed method. We hope our work could lead to more future studies.
+ Acknowledgement. This research was partially supported by grants from the National Natural Science Foundation of China (No. U20A20229), the Anhui Provincial Natural Science Foundation of China (No. 2308085QF229 and 2308085MG226) and the Fundamental Research Funds for the Central Universities.
+ References
+ 1. Alt, C., Gabryszak, A., Hennig, L.: TACRED revisited: A thorough evaluation of the TACRED relation extraction task. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 1558–1569 (2020)
+ 2. Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Advances in Neural Information Processing Systems 33, 1877–1901 (2020)
+ 3. Chen, X., Zhang, N., Xie, X., Deng, S., Yao, Y., Tan, C., Huang, F., Si, L., Chen, H.: KnowPrompt: Knowledge-aware prompt-tuning with synergistic optimization for relation extraction. In: Proceedings of the ACM Web Conference 2022, pp. 2778–2788 (2022)
+ 4. Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: ACL-IJCNLP 2021, pp. 3816–3830 (2021)
+ 5. Gutiérrez, B.J., McNeal, N., Washington, C., Chen, Y., Li, L., Sun, H., Su, Y.: Thinking about GPT-3 in-context learning for biomedical IE? Think again. In: Findings of EMNLP 2022, pp. 4497–4512 (2022)
+ 6. Han, J., Zhao, S., Cheng, B., Ma, S., Lu, W.: Generative prompt tuning for relation classification. In: Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 3170–3185 (2022)
+ 7. Han, X., Zhao, W., Ding, N., Liu, Z., Sun, M.: PTR: Prompt tuning with rules for text classification. AI Open 3, 182–192 (2022)
+ 8. Liu, J., Shen, D., Zhang, Y., Dolan, W.B., Carin, L., Chen, W.: What makes good in-context examples for GPT-3? In: Proceedings of Deep Learning Inside Out (DeeLIO 2022), pp. 100–114 (2022)
+ 9. Liu, Y., Wu, H., Huang, Z., Wang, H., Ning, Y., Ma, J., Liu, Q., Chen, E.: TechPat: Technical phrase extraction for patent mining. ACM Transactions on Knowledge Discovery from Data 17(9), 1–31 (2023)
+ 10. Liu, Y., Zhang, K., Huang, Z., Wang, K., Zhang, Y., Liu, Q., Chen, E.: Enhancing hierarchical text classification through knowledge graph integration. In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 5797–5810 (2023)
+ 11. Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: RoBERTa: A robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)
+ 12. Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)
+ 13. Lyu, S., Chen, H.: Relation classification with entity type restriction. In: Findings of ACL-IJCNLP 2021, pp. 390–395 (2021)
+ 14. OpenAI: GPT-4 technical report (2023)
+ 15. Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., et al.: Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35, 27730–27744 (2022)
+ 16. Stoica, G., Platanios, E.A., Póczos, B.: Re-TACRED: Addressing shortcomings of the TACRED dataset. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 13843–13850 (2021)
+ 17. Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S., et al.: Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)
+ 18. Tunstall, L., Beeching, E., Lambert, N., Rajani, N., Rasul, K., Belkada, Y., Huang, S., von Werra, L., Fourrier, C., Habib, N., et al.: Zephyr: Direct distillation of LM alignment. arXiv preprint arXiv:2310.16944 (2023)
+ 19. Wan, Z., Cheng, F., Mao, Z., Liu, Q., Song, H., Li, J., Kurohashi, S.: GPT-RE: In-context learning for relation extraction using large language models. arXiv preprint arXiv:2305.02105 (2023)
+ 20. Xu, X., Zhu, Y., Wang, X., Zhang, N.: How to unleash the power of large language models for few-shot relation extraction? In: Proceedings of the Fourth Workshop on Simple and Efficient Natural Language Processing (SustaiNLP), pp. 190–200 (2023)
+ 21. Zhang, K., Zhang, K., Zhang, M., Zhao, H., Liu, Q., Wu, W., Chen, E.: Incorporating dynamic semantics into pre-trained language model for aspect-based sentiment analysis. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 3599–3610 (2022)
+ 22. Zhang, Y., Zhong, V., Chen, D., Angeli, G., Manning, C.D.: Position-aware attention and supervised data improve slot filling. In: Conference on Empirical Methods in Natural Language Processing (2017)
+ 23. Zhou, W., Chen, M.: An improved baseline for sentence-level relation extraction. In: AACL-IJCNLP 2022, p. 161 (2022)
output_files/semantic_chunk.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:275fffde1c5fea9575a7af7c996b3d9a771eb3cd65980b0e1c8bf7317813e738
+ size 1576075
output_files/semantic_chunk_bm25.index ADDED
Binary file (47 kB).
 
output_files/semantic_chunk_metadata.json ADDED
The diff for this file is too large to render. See raw diff
 
output_files/semantic_chunk_output.json ADDED
@@ -0,0 +1,972 @@
+ [
+     {"id": "chunk0", "chunk": "激 光 生 物 学 报 ACTA LASER BIOLOGY SINICA Vol. 33 No. 5 Oct. 2024 第33 卷第5 期 2024 年10 月 收稿日期:2024-03-15", "method": "semantic_chunk"},
+     {"id": "chunk1", "chunk": "基金项目:国家重点研发计划项目(2022YFA0912500)", "method": "semantic_chunk"},
+     {"id": "chunk2", "chunk": "深圳市科技计划资助项目(RCBS20221008093108030, ZDSYS20230626090759006)", "method": "semantic_chunk"},
+     {"id": "chunk3", "chunk": "* 通信作者:王芳,研究员,主要从事微生物与合成生物学的研究", "method": "semantic_chunk"},
+     {"id": "chunk4", "chunk": "E-mail: [email protected]", "method": "semantic_chunk"},
+     {"id": "chunk5", "chunk": "合成微生物与水污染治理应用 林雅倩1,帅菲斐2,王 芳2* (1. 广西中医药大学,南宁 530200", "method": "semantic_chunk"},
+     {"id": "chunk6", "chunk": "2. 深圳市微生物基因组修饰编辑与应用重点实验室,重症医学科,深圳市 转化医学研究院,深圳市第二人民医院,深圳 518035) 摘 要:水污染是当今世界面临的严峻环境问题之一,影响人们的生活质量和经济的可持续发展", "method": "semantic_chunk"},
+     {"id": "chunk7", "chunk": "利用微生物 的降解能力以及与之相应的生物修复技术是去除环境污染物以及修复水环境的重要手段", "method": "semantic_chunk"},
+     {"id": "chunk8", "chunk": "但是,由于水污染环 境的复杂现状,传统微生物学治理方法受到了限制", "method": "semantic_chunk"},
+     {"id": "chunk9", "chunk": "运用合成生物学“设计-构建-测试-学习”的理念,对目标微 生物底盘进行从头设计、改造,从而获得对污染物具有高效、广谱降解能力的合成微生物,不仅可满足现在复杂污 水环境的治理要求,还能实现绿色、高效的循环治理,是未来合成生物学的发展应用方向之一", "method": "semantic_chunk"},
+     {"id": "chunk10", "chunk": "本文简要介绍了 水污染的现状、常用的治理技术以及当今合成微生物在水污染治理中的应用,重点介绍了合成微生物底盘的种类 以及通过底盘改造构建的合成微生物在水污染治理中的优势,以期通过合成生物学手段不断完善和优化污染物 降解和水污染治理技术,为实现水环境污染的高效治理提供参考", "method": "semantic_chunk"},
+     {"id": "chunk11", "chunk": "环境治理 中图分类号:X52 文献标志码:A DOI:10.3969/j.issn.1007-7146.2024.05.004 Application of Synthetic Microorganisms in Water Pollution Prevention and Control LIN Yaqian1, SHUAI Feifei2, WANG Fang2* (1. Guangxi University of Chinese Medicine, Nanning 530200, China; 2. Intensive Care Unit, Shenzhen Key Laboratory of Microbiology in Genomic Modification & Editing and Application, Shenzhen Institute of Translational Medicine, Shenzhen Second People′s Hospital, Shenzhen 518035, China) Abstract: Water pollution is one of the most serious environment problems that affect people′s life quality and sustainable economic development worldwide. The degradation capability of microorganisms endows them advantages in environmental pollutants elimination and water environments repairment, with the utilization of diverse bioremediation technologies. However, due to the complex situation of water pollution environment, traditional microbiological treatment methodology is restricted and limited. With the engineering principle Design-Build-Test-Learn in synthetic biology, the designed and modified synthetic mi- croorganisms with efficient and broad-spectrum degradation ability for pollutants has attracted the researchers′ attention. The synthetic microorganisms could not only meet the wastewater treatment requirements within complex pollutants, but also could achieve the green, efficient circular treatment goal. Therefore, design and construction of synthetic microorganisms for water pollution control has becoming one of the future development and application direction of synthetic biology. Herein, we intro- duced the current situation of water pollution, the treatment technologies classically utilized, and the synthetic microorganisms′ application in water pollution control. This review focuses on different types of microbial chassis in synthetic biology and the 419 第5 期 水是生命的起源,是生命赖以生存的根本,是 经济生产不可替代的基础,是维护生态环境的要 素", "method": "semantic_chunk"},
+     {"id": "chunk12", "chunk": "但随着科技的发展,大量环境污染物被排放, 水环境急速恶化,缺水已经成为人类日常生活面临 的主要问题之一[1]", "method": "semantic_chunk"},
+     {"id": "chunk13", "chunk": "据推断,到2050 年,全球一半 人口将面临严重的水资源短缺[1]", "method": "semantic_chunk"},
+     {"id": "chunk14", "chunk": "当前,我国生态 文明建设以降碳为重点战略方向,污水治理既是污 染防治的重要组成,也是温室气体减排的重要领 域", "method": "semantic_chunk"},
+     {"id": "chunk15", "chunk": "推进减污降碳协同增效已成为我国新发展阶 段经济社会发展全面绿色转型的必然选择", "method": "semantic_chunk"},
+     {"id": "chunk16", "chunk": "所以, 为了地球生态健康,也为了实现经济的可循环发 展,对水污染的治理刻不容缓", "method": "semantic_chunk"},
+     {"id": "chunk17", "chunk": "针对不同的水污染情况,可应用物理法、化学 法和生物法等治理方法,其中基于微生物降解转化 能力进行的生物法在实际治理中发挥着关键作用, 但是由于单一微生物降解易受多重环境因素影响 等限制,其作用范围和应用场景极其有限", "method": "semantic_chunk"},
+     {"id": "chunk18", "chunk": "随着合 成生物学作为“第三次生物科学革命”的发展,有 目的地设计、改造甚至重新合成构建生物体成为 可能,从而加速了特定污染物从无法降解到能够降 解、从低效降解到高效降解的转变[2],为污水的高 效治理带来了新方向", "method": "semantic_chunk"},
+     {"id": "chunk19", "chunk": "采用合成生物学使能技术,以工程化设计为理 念,对底盘微生物进行定向设计、改造、从头合成和 构建,最终构建可感知、报告并降解特定污染物的 合成微生物,是实现高效、绿色、经济、智能化治理 污水目标的未来方向", "method": "semantic_chunk"},
+     {"id": "chunk20", "chunk": "本文简述了水污染的不同 来源、现有的不同治理方案,并介绍了可用于水污 染治理的代表性合成微生物底盘种类及对其底盘 的改造构建方法,最后对合成微生物在水污染治理 中的优势进行了评述", "method": "semantic_chunk"},
+     {"id": "chunk21", "chunk": "1 水污染的现状 水是我们日常生活中最重要的一类资源", "method": "semantic_chunk"},
+     {"id": "chunk22", "chunk": "地 球上大约71% 的面积被水覆盖,其中97.5% 是海 水,只有2.5% 的淡水可供人类直接使用[3]", "method": "semantic_chunk"},
+     {"id": "chunk23", "chunk": "随着全 球经济和人口的快速增长,工业化、城市化步伐加 快,多种有害污染物进入环境中,由于环境污染物 的排放和水环境的恶化,水污染已成为全球面临并 且日益严峻的挑战[4]", "method": "semantic_chunk"},
+     {"id": "chunk24", "chunk": "污水中的大量重金属和有机 污染物,不仅严重威胁环境和生态平衡,而且危害 了人类健康,对经济增长和社会发展产生了显著的 负面影响[4-5]", "method": "semantic_chunk"},
+     {"id": "chunk25", "chunk": "目前水污染的主要来源有工业污染、 农业污染、医疗污染以及生活污染等(图1)", "method": "semantic_chunk"},
+     {"id": "chunk26", "chunk": "1.1 工业污水 随着全球工业化和城市化速度的加快,工业部 门用水增加,其用水量约占全球的22%,污水排放 量约占全球污水排放总量的28%[6-7]", "method": "semantic_chunk"},
+     {"id": "chunk27", "chunk": "常见的工业 水污染物包括汞(Hg)、镉(Cd)、砷(As)等有毒重金 属,石油以及氯化物等,根据化学性质可分为无机 污染物和有机污染物[5-6]", "method": "semantic_chunk"},
+     {"id": "chunk28", "chunk": "在众多的工业机构中, 来自造纸业、煤炭制造业、石油化工业、纺织业和制 药业的废水基本上是剧毒的,已被证实对周围环境 存在最高程度的危害性[8]", "method": "semantic_chunk"},
+     {"id": "chunk29", "chunk": "1.2 农业污水 全球人口正在以惊人的速度增长,预计到2050 年,全球粮食产量需要增加50%[9]", "method": "semantic_chunk"},
+     {"id": "chunk30", "chunk": "为了满足全 球的粮食需要,在农作物种植阶段,大量农药被采 用以提高农作物的产量和质量", "method": "semantic_chunk"},
+     {"id": "chunk31", "chunk": "基于不同的作用 目的,农药可分为除草剂、杀虫剂、灭鼠剂、杀菌剂 等,主要成分有硫磺、有毒重金属、生物碱和有机 磷等[9-10]", "method": "semantic_chunk"},
+     {"id": "chunk32", "chunk": "虽然是为了满足粮食需要,但是过度使 用农药会破坏生态系统、损害水质并造成人体损 伤[11]", "method": "semantic_chunk"},
+     {"id": "chunk33", "chunk": "农药可通过与土壤吸附结合,破坏土壤团粒 结构,并从中渗入地下水,造成地下水与地表水的 污染", "method": "semantic_chunk"},
+     {"id": "chunk34", "chunk": "4,4′- 二氯二苯基二氯乙烷、毒死蜱(chlorpy- rifos,CP)和二嗪农等一些可溶性、挥发性农药常在 空气中富集,在降雨时,会重新汇集并随着地表水 渗透到土壤、地下水之中,从而加剧水污染的严重 程度[11-12]", "method": "semantic_chunk"},
+     {"id": "chunk35", "chunk": "merits of application of synthetic microorganisms in water pollution control. This review will facilitate the performance optimi- zation of pollutant degradation and wastewater treatment via synthetic biology and provide reference for the efficient treatment of wastewater environment. Key words: synthetic microorganisms; microbial chassis; water pollution; bioremediation technology; environmental gover- nance (Acta Laser Biology Sinica, 2024, 33(5): 418-431) 林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 激 光 生 物 学 报 420 第 33 卷 1.3 医疗废水 医疗废水主要来源于医院,其成分十分复杂, 包含各种病原微生物、抗生素、有毒有机化合物、放 射性元素和离子污染物等[13]", "method": "semantic_chunk"},
+     {"id": "chunk36", "chunk": "根据科室不同,医疗 系统产生的污水成分也不尽相同", "method": "semantic_chunk"},
+     {"id": "chunk37", "chunk": "如放射科污水 中存在造影剂,手术室排出的污水含大量麻醉类药 物[14-16]", "method": "semantic_chunk"},
+     {"id": "chunk38", "chunk": "总体来说,医疗废水具有成分复杂、难降 解甚至传染性等特点", "method": "semantic_chunk"},
+     {"id": "chunk39", "chunk": "值得强调的是,当今大多数 医疗废水往往没有经过特殊处理就与生活污水一 起排入了污水处理厂[17-18],这就使许多药物和微生 物在不发生结构和毒性明显变化的情况下,通过污 水处理厂并进入了地表水[13],从而进一步导致了抗 生素耐药细菌的产生和传播[18]", "method": "semantic_chunk"},
+     {"id": "chunk40", "chunk": "1.4 生活污水 生活污水由70% 的有机物和30% 的无机物组 成,通常分为两大类,即黑水和灰水[19-20]", "method": "semantic_chunk"},
+     {"id": "chunk41", "chunk": "黑水是 厕所排放的富含有机物、氮和磷的污水,主要有粪 便、尿液和食物残渣等", "method": "semantic_chunk"},
+     {"id": "chunk42", "chunk": "灰水是用于洗涤或洗澡的 废水,包括来自厨房水槽、淋浴间和洗衣机的废 水[19, 21-22],主要含有酸碱性化合物、脂肪、重金属、 硝酸盐和异生化合物等[22]", "method": "semantic_chunk"},
+     {"id": "chunk43", "chunk": "生活中,灰水产量可达 黑水产量的1~7 倍[19-20]", "method": "semantic_chunk"},
+     {"id": "chunk44", "chunk": "鉴于当今许多污水处理 厂的处理能力欠缺,大量生活废水流入地下水和地 表水中,导致水质恶化,影响水生生物的生存", "method": "semantic_chunk"},
+     {"id": "chunk45", "chunk": "2 现有水污染治理技术 目前用于治理水污染的方法主要可分为物理 方法、化学方法、生物学方法(图1),可根据污染物 浓度、污水组成、工艺成本或污水中存在的额外杂 质,选择合适的治理方法[23]", "method": "semantic_chunk"},
+     {"id": "chunk46", "chunk": "2.1 物理法 物理法是指借助物理作用来处理、回收水体中 的污染物质,通常需要在不影响污染物生化特性的 情况下进行", "method": "semantic_chunk"},
+     {"id": "chunk47", "chunk": "其中吸附法因其灵活性、广泛的适用性、成本效 益高和实用性,成为污水治理中最有利的方法[24]", "method": "semantic_chunk"},
+     {"id": "chunk48", "chunk": "常见的吸附剂有生物吸附剂、二氧化硅、氧化铝、活 性炭、黏土、金属氧化物、二氧化钛等[23]", "method": "semantic_chunk"},
+     {"id": "chunk49", "chunk": "由于吸附 剂饱合能力有限、难以再生,且某些污染物难以通 过吸附被去除[4],限制了吸附法的实际应用", "method": "semantic_chunk"},
+     {"id": "chunk50", "chunk": "但是, 已有研究通过合成新型纳米复合吸附剂,成功去除 污水中的苯并吡啶、氧氟沙星等污染物[25-26]", "method": "semantic_chunk"},
+     {"id": "chunk51", "chunk": "图1 水污染的来源、组成及治理 Fig. 1 Sources, types, and treatment strategies of water pollution 421 第5 期 2.2 化学法 化学法是指在进行污水治理时,借助化学反应 来治理留存于水体中污染物质的方法", "method": "semantic_chunk"},
+     {"id": "chunk52", "chunk": "化学处理 方法主要有氧化还原法、化学沉淀法、混凝和离子 交换等,通常用来处理一些无机物质和部分难以降 解的有机污染物", "method": "semantic_chunk"},
+     {"id": "chunk53", "chunk": "生活中常常通过化学法,如臭氧 和氯、次氯酸钠等来初步消除医院污水中的细菌和 病毒[13,27]", "method": "semantic_chunk"},
+     {"id": "chunk54", "chunk": "采用氧化还原原理的高级氧化工艺(ad- vanced oxidation process,AOP)是去除顽固有机污染 物并灭活传统技术无法处理的病原微生物的一种 极具潜力的技术,其作用方式有氢提取、自由基的 组合或加成以及电子转移[28-29]", "method": "semantic_chunk"},
+     {"id": "chunk55", "chunk": "已有研究通过电化 学氧化- 硫酸盐AOP 成功降解了化工企业高浓度含 氰有机废水中的有机物和氰化物[30]", "method": "semantic_chunk"},
+     {"id": "chunk56", "chunk": "2.3 生物法 生物法是利用微生物的代谢能力对污水中 的有机物质进行分解的方法", "method": "semantic_chunk"},
+     {"id": "chunk57", "chunk": "由于成本效益高和 环境相容性好,生物修复具有极好的未来发展潜 力[3,31]", "method": "semantic_chunk"},
+     {"id": "chunk58", "chunk": "生物法通过使用细菌、真菌、微藻、酵母和 其他微生物菌群进行,由于体积小、比表面积大,它 们已成为当今理想的生物修复制剂[32]", "method": "semantic_chunk"},
+     {"id": "chunk59", "chunk": "生物法可 通过吸附或积累来降解污水中的各种污染物[28]", "method": "semantic_chunk"},
+     {"id": "chunk60", "chunk": "目前,已有研究使用微生物单一培养法对污水中的 染料进行降解和脱色[33],以及采用高效藻类塘去除 污水中的抗高血压药、抗抑郁药和消炎药等[34]", "method": "semantic_chunk"},
+     {"id": "chunk61", "chunk": "以上污水治理方法各有千秋,现有的许多废水 处理研究采用组合生物、化学和/ 或物理处理的技 术来应对传统单一处理技术的局限性[5]", "method": "semantic_chunk"},
+     {"id": "chunk62", "chunk": "物理和化 学方法治理的缺点是价格昂贵、效率低、易产生副 产物,如污泥和次级代谢物等[3,35]", "method": "semantic_chunk"},
+     {"id": "chunk63", "chunk": "在某些情况下, 吸附、化学沉淀或电化学沉淀等方法不能从废水中 去除染料或其代谢物等化合物,只能将污染物从一 相转移到另一相,并未从根本上解决污染问题[35]", "method": "semantic_chunk"},
+     {"id": "chunk64", "chunk": "与物理和化学方法相比,生物处理方法虽具有环 境友好、高效且可行性高等优势[5],但易受温度和 pH 值等环境因素的影响,导致污染物处理速度变 慢[4,32]", "method": "semantic_chunk"},
+     {"id": "chunk65", "chunk": "组合处理方法在实施时也存在多个限制, 诸如缺乏技术知识和数据的可行性,处理材料的效 率随着时间的推移而降低等[5],且大多数研究都是 使用模拟或稀释的废水进行的,并不能代表实际污 水的发生情况[35],所以在水污染治理中,应该结合 实际情况具体分析", "method": "semantic_chunk"},
+     {"id": "chunk66", "chunk": "3 合成微生物在水污染治理中的应用 合成生物学是一门集合了生物化学、分子生物 学和计算机等多学科的新兴学科,通过分子生物学 工具,对生物体的遗传信息重新编辑使其具有特定 的功能[2,36]", "method": "semantic_chunk"},
+     {"id": "chunk67", "chunk": "近年来,随着合成生物学的发展,现有 技术已可将复杂的外源性代谢途径引入特定的微 生物宿主,并进行特定的人为改造,以实现特定的 目标", "method": "semantic_chunk"},
+     {"id": "chunk68", "chunk": "合成生物学方法在一定程度上解决了微生 物中只含有单一化合物的分解代谢基因这一缺陷, 并使微生物可以监测、聚集和降解环境污染物[2,37], 从而使合成生物学在水污染治理中受到广泛应用", "method": "semantic_chunk"},
+     {"id": "chunk69", "chunk": "采用合成生物学“设计- 构建- 测试- 学习”的 理念,对生物元件挖掘、分析,设计构建而成的具有 特定功能的微生物,称为合成微生物(图2)", "method": "semantic_chunk"},
+     {"id": "chunk70", "chunk": "底盘 作为引入具有不同代谢途径元件的宿主,其开发和 构建是合成生物学领域的一个关键,也是构建合成 微生物的基础[38]", "method": "semantic_chunk"},
+     {"id": "chunk71", "chunk": "底盘细胞的普遍含义是改善了 细菌、古细菌或植物细胞中遗传信息的宿主,也是 将合成的功能化元件、线路和途径等体系置入其中 以达到设计目标的重要合成生物学反应平台[39]", "method": "semantic_chunk"},
+     {"id": "chunk72", "chunk": "在合成生物学中,底盘为运行合成系统提供能 量,被认为是合成生物学的“硬件”,而生物部件和 生物设备则被认为是“软件”在底盘中“加载并运 行”", "method": "semantic_chunk"},
+     {"id": "chunk73", "chunk": "合成生物学被认为是实现在“硬件”底盘中 “即插即用”的模块化“软件”的生物学[40](图2)", "method": "semantic_chunk"},
+     {"id": "chunk74", "chunk": "在对一个底盘操作前,应了解其基因组序列, 并具有先进的遗传工具可对其进行深度编辑", "method": "semantic_chunk"},
+     {"id": "chunk75", "chunk": "这 些遗传工具包括表达载体、启动子和先进的基因组 编辑技术", "method": "semantic_chunk"},
+     {"id": "chunk76", "chunk": "同时一个合格的底盘应具有遗传和进 化稳定性、在目的环境中的可用性等特征[39]", "method": "semantic_chunk"},
+     {"id": "chunk77", "chunk": "目前 应用在水污染治理中的底盘细胞主要有大肠杆菌、 恶臭假单胞菌、枯草芽孢杆菌、酿酒酵母等模式微 生物,以及盐单胞菌(Halomonas spp.)、拜氏不动杆 菌和需钠弧菌等非模式微生物[38,41](图3)(表1)", "method": "semantic_chunk"},
+     {"id": "chunk78", "chunk": "3.1 基于大肠杆菌 大肠杆菌(Escherichia coli)是一种革兰氏阴性 杆状细菌,被归类为肠杆菌科,该菌主要寄生在哺 乳动物肠道中,可通过粪便和废水在环境中传播, 造成污染[42]", "method": "semantic_chunk"},
+     {"id": "chunk79", "chunk": "由于大肠杆菌具有生长快速、培养基 简单和经济效应高等优势,在合成生物学中常作为 模式微生物底盘被研究和使用[38, 40]", "method": "semantic_chunk"},
+     {"id": "chunk80", "chunk": "林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 激 光 生 物 学 报 422 第 33 卷 图2 合成微生物的构建流程 Fig. 2 Work flow of the biosynthetic microorganisms construction 图3 基于合成微生物的水污染治理 Fig. 3 Construction and application of the synthetic microorganisms in water pollution 鉴于已有多个基因编辑技术可在大肠杆菌中 高效实施,如锌指核酸酶技术(zinc finger nuclease, ZFN)、转录激活因子样效应因子核酸酶技术(tran- scription activator-like effector nuclease,TALEN)、同源重 组和成簇的规则间隔短回文重复序列系统[clustered regularly interspaced short palindromic repeats (CRISPR)/ CRISPR-associated (Cas) protein,CRISPR-Cas]等[43],目 前大量研究以大肠杆菌为底盘进行蛋白质和代谢物 的生物合成,如血红素蛋白和5- 氨基乙酰丙酸的合 成等[44-46]", "method": "semantic_chunk"},
+     {"id": "chunk81", "chunk": "CRISPR-Cas 系统是一种天然免疫系统, 广泛分布于细菌和古细菌中[47-48]", "method": "semantic_chunk"},
+     {"id": "chunk82", "chunk": "与传统的低通量 ZFN和TALEN相比,CRISPR是一种可编程的下一代 高通量基因编辑技术,可以直接将一组首选指令传 输到微生物的基因组中,大量水污染相关研究采用 该体系构建合成微生物", "method": "semantic_chunk"},
+     {"id": "chunk83", "chunk": "Li 等[47]基于CRISPR-Cas9 构建的pEcCas/pEcgRNA 系统可以实现E. coli BL21 423 第5 期 (DE3)的高效基因组编辑,并将此系统用于更广泛 的大肠杆菌和其他肠杆菌科物种中", "method": "semantic_chunk"},
+     {"id": "chunk84", "chunk": "Zhu等[49]基于 CRISPR-Cas12a 系统实现了精确的DNA 靶向和切割 功能,成功建立了RAA-CRISPR-Cas12a检测系统,实 现了对食源性和水源性大肠杆菌病原体E. coli O157: H7的快速、特异和高灵敏检测", "method": "semantic_chunk"},
+     {"id": "chunk85", "chunk": "在污染物降解方面,Wang 等[50]通过对从苯酚 降解细菌红球菌(Rhodococcus)和恶臭假单胞菌 (Pseudomonas putida)中提取的pheA1、catA和pcaI 等基因,进行一系列优化(密码子优化、GC 含量平 衡、去除不必要的切割位点等)和整合操作,使其在 E. coli BL221-AI 内表达,构建的重组菌株能迅速降 解废水中的苯酚", "method": "semantic_chunk"},
+     {"id": "chunk86", "chunk": "对来源于红球菌的tfdA、pcaJ 和 pcaF 等9 个2,4- 二氯苯氧乙酸(2,4-dichlorophen- oxyacetic acid,2,4-D)降解基因,进行上述类似的优 化整合操作,Wang 等[51]构建的重组E. coli BL-3164 可以实现以2,4-D 为唯一碳源的生长,降低环境中 2,4-D含量的有效目标", "method": "semantic_chunk"},
+     {"id": "chunk87", "chunk": "3.2 基于恶臭假单胞菌 恶臭假单胞菌(Pseudomonas putida)是一种革 兰氏阴性杆状细菌,属假单胞菌属(Pseudomonas), 作为模式底盘在工业和环境生物技术中都得到了 广泛应用[52-54]", "method": "semantic_chunk"},
+     {"id": "chunk88", "chunk": "恶臭假单胞菌具有强大的氧化还原 代谢能力,对许多不同类型的物理化学损伤具有耐 受性,能在恶劣的环境中生长且具有异种生物降解 的能力,且能从受污染的地区被分离与培养[53, 55], 这激发了研究人员对恶臭假单胞菌作为底盘降解 污染物的研究兴趣", "method": "semantic_chunk"},
+     {"id": "chunk89", "chunk": "目前已经有大量分子工具可用于恶臭假单胞 菌遗传操作和代谢编程[53,55],如反选择标记(coun- terselectable marker,CSM)、异源重组酶和自杀载体, 这些分子工具已被验证可促进该属细菌的等位基 因交换[54]", "method": "semantic_chunk"},
+     {"id": "chunk90", "chunk": "Aparicio 等[56]将ssDNA 重组与CRISPR- Cas9 联合应用在恶臭假单胞菌中,开发了一种高 效、快速的基因编辑方案", "method": "semantic_chunk"},
+     {"id": "chunk91", "chunk": "Liang 等[57]构建并优化 了恶臭假单胞菌中的T7 样表达系统,提供了一套 适用的底盘和相应的质粒提高重组表达水平,可用 于表达其他底盘中难以表达的蛋白质", "method": "semantic_chunk"},
+     {"id": "chunk92", "chunk": "此外,各种 组成型启动子,多种天然及合成的诱导启动子也已 经在恶臭假单胞菌中被发现[53]", "method": "semantic_chunk"},
+     {"id": "chunk93", "chunk": "在污染物降解方面,采用基于upp 基因作为 CSM 的基因组编辑方法,Gong 等[58]将多个降解 基因整合到P. putida KT2440 染色体中,构建的合 成菌株具有降解农药甲基对硫磷(methyl parathi- on,MP)和γ- 六氯环己烷(γ-hexachlorocyclohexane, γ-HCH)的能力", "method": "semantic_chunk"},
+     {"id": "chunk94", "chunk": "Liang 等[59]通过自杀质粒pK- 18mobsacB 构建删除了GIs 的P. putida KT2440,所 产生的突变体KTU-U13 不仅表现出较高的质粒转 化效率和异源蛋白表达能力,且作为受体细胞时, 对γ-HCH 和1,2,3- 三氯丙烷(1,2,3-trichloropro- pane,TCP)的降解效率也显著增强", "method": "semantic_chunk"},
+     {"id": "chunk95", "chunk": "3.3 基于枯草芽孢杆菌 枯草芽孢杆菌(Bacillus subtilis)是革兰氏阳性 芽孢杆菌,其外源蛋白具有分泌能力强、遗传可操 作性强和易培养等优势,已被广泛用作底盘生产生 物聚合物、工业酶制剂等[52, 60-61]", "method": "semantic_chunk"},
+     {"id": "chunk96", "chunk": "同时,枯草牙孢杆 菌主要分布于土壤及腐烂的有机物中,具备作为合 成微生物底盘应用于污染物治理的潜在应用价值", "method": "semantic_chunk"},
+     {"id": "chunk97", "chunk": "常应用于枯草牙孢杆菌的基因编辑手段有 CSM,位点特异性重组系统、CRISPR-Cas9 及其衍生 的CRISPRi和CRISPR-Cpf1系统[62]", "method": "semantic_chunk"},
+     {"id": "chunk98", "chunk": "CRISPR-Cas9可 用于枯草牙孢杆菌的基因突变、缺失和插入等,是 枯草牙孢杆菌基因遗传操作中最强大的工具之一", "method": "semantic_chunk"},
+     {"id": "chunk99", "chunk": "近期,Ferrando 等[60]基于CRISPR-Cas9 的新型基因 组编辑开发了一种简单、快速、方便的方法,可在枯 草牙孢杆菌中同时插入三个基因,这为枯草牙孢杆 菌作为微生物底盘的未来发展提供了良好的前景", "method": "semantic_chunk"},
+     {"id": "chunk100", "chunk": "Zhu 等[63]通过敲除枯草牙孢杆菌突变株ZN0871v11 中的yqfY和spo0A基因,所获得的菌株GEBS可同时 吸附废水中的有机物、阳离子和阴离子,并在几分 钟内实现共沉淀,从而达到净化水质、废水回收的 目标", "method": "semantic_chunk"},
+     {"id": "chunk101", "chunk": "Ghataora 等[61]通过整合源自革兰氏阴性菌的 可变金属结合域与在枯草牙孢杆菌中起作用的DNA 结合结构域,并辅以结构引导设计,在枯草牙孢杆菌 中生成了具有金属敏感性的生物传感器模块", "method": "semantic_chunk"},
+     {"id": "chunk102", "chunk": "此外,枯草牙孢杆菌的另一特性是能形成复杂 而强大的生物膜[62, 64]", "method": "semantic_chunk"},
+     {"id": "chunk103", "chunk": "生物膜是一种由紧密结合的 细菌组成的结构群落,被自身产生的细胞外基质包 裹,这种基质允许细菌附着在表面,使生物膜可抵 抗氧化应激等环境条件,因此生物膜可应用于生物 修复技术[65-66]", "method": "semantic_chunk"},
+     {"id": "chunk104", "chunk": "合成生物学的进步使人们能够对生 物膜进行重新编程,以改善其功能或提高增值产品 的产量[65]", "method": "semantic_chunk"},
+     {"id": "chunk105", "chunk": "利用含有枯草牙孢杆菌的生物膜或多 物种生物膜组合的生物反应器,可以实现废水和周 林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 激 光 生 物 学 报 424 第 33 卷 围环境中有毒污染物降解的目标", "method": "semantic_chunk"},
+     {"id": "chunk106", "chunk": "Li 等[67]将有机 腈降解过程中发挥关键作用的腈水合酶和酰胺酶 的编码基因nha、ami整合至B. subtilis N4中,构建了 B. subtilis N4/pHTnha-ami,可实现废水中有机腈的降 解,同时,联合使用带正电荷的改性聚乙烯载体移动 床生物膜反应器,促进了细菌的黏附和生物膜的形 成", "method": "semantic_chunk"},
+     {"id": "chunk107", "chunk": "Zhu等[68]通过整合型质粒pDG1730构建了能传 感和吸附重金属离子的基因回路,并转至B. subtilis 1935ΔepsΔbslA 底盘中,所形成的菌株不仅能实时 感应三种金属离子[Pb(II)、Hg(II)和Cu(II)]且形成 的生物膜能实现对这些金属离子的特异性吸附", "method": "semantic_chunk"},
+     {"id": "chunk108", "chunk": "3.4 基于酿酒酵母 酿酒酵母(Saccharomyces cerevisiae)是一种真 核模式生物,具有遗传背景明确、生长迅速、操作简 单等优点[69-70]", "method": "semantic_chunk"},
+     {"id": "chunk109", "chunk": "酿酒酵母与原核模式微生物相比有 许多独特的优势,如可增加特异性酶的表达和翻译 后修饰能力,为特异性酶的表达提供优越的环境, 以及具有细胞器,可以分隔产物的合成,减轻产物 积累导致的细胞代谢负担等[52]", "method": "semantic_chunk"},
+     {"id": "chunk110", "chunk": "因此,酿酒酵母成 为了用于生产燃料、食品添加剂和生物制药方面的 优质底盘候选物[70-71]", "method": "semantic_chunk"},
+     {"id": "chunk111", "chunk": "此外,基于酵母细胞对pH值 和温度变化的高适应性,酿酒酵母也成为有毒污染 物生物传感器应用中的合适底盘[72]", "method": "semantic_chunk"},
+     {"id": "chunk112", "chunk": "但是,酿酒酵母和非生产性程序的内源性因素 常常损害外源基因的表达和插入,因此,目前开发 了多种技术用来增强和提高其细胞内基因转录的 强度和效率并促进基因编辑的程序[69]", "method": "semantic_chunk"},
+     {"id": "chunk113", "chunk": "其中CRIS- PR-Cas9 系统已成功应用于多种酵母细胞的基因编 辑中,如删除细胞特定代谢过程中的相关基因,以 及在一步反应中同时进行多基因敲除、下调和过表 达等[73]", "method": "semantic_chunk"},
+     {"id": "chunk114", "chunk": "酿酒酵母中单一向导RNA(sgRNA)的最 广泛利用提高了CRISPR-Cas9 系统基因编辑的效 率[69]", "method": "semantic_chunk"},
+     {"id": "chunk115", "chunk": "Fan 等[74]通过优化半乳糖诱导系统的关键 元件Gal4 和Gal80,并利用CRISPR-Cas9 介导的基 因组编辑技术将其分别连接至铜诱导启动子CUP1 和铜抑制启动子CTR1 中,合成的酵母细胞可用于 高效检测环境中的Cu(II)", "method": "semantic_chunk"},
+     {"id": "chunk116", "chunk": "在污染物降解方面,通过把细菌lacZ 报告基 因与含酵母JLP1 启动子的质粒相连,Ito-Harashima 等[75]构建了一个携带高拷贝数pESC-JLP1-lacZ 报 告质粒的菌株,并将其用于高效、快速监测水和土 壤中的Cd 污染", "method": "semantic_chunk"},
+     {"id": "chunk117", "chunk": "Ruta 等[76]将多种植物金属硫蛋白 分别靶向酵母质膜的内表面,改造后的酿酒酵母可 在低浓度下吸附Cu(II),且表达myrGFP-NcMT3 的 菌株可在高浓度Cu(II)、Ni(II)和Mn(II)等金属离 子条件下表现出强劲的生长力、金属耐受性和高累 积能力", "method": "semantic_chunk"},
+     {"id": "chunk118", "chunk": "Mashangoane 等[77]将钯结合肽(4R-PdBP) 连接至pYD5 质粒上,形成重组质粒pYD5/4R-PdBP 并转化至S. cerevisiae EBY100 中,构建的重组菌株 可从水溶液中生物吸附钯[Pd(II)]", "method": "semantic_chunk"},
+     {"id": "chunk119", "chunk": "3.5 基于非模式底盘 模式底盘微生物研究时间长,研究人员多,遗 传工具丰富,目前已经得到了大量的研究和应用, 在微生物遗传与合成生物学的基础理论研究以及 应用中发挥着举足轻重的作用", "method": "semantic_chunk"},
+     {"id": "chunk120", "chunk": "但在实际应用中, 模式底盘受环境影响较大,往往不能达到预期效 果,这时,具备底盘所需特性的一些非模式底盘微 生物吸引了研究学者的目光", "method": "semantic_chunk"},
+     {"id": "chunk121", "chunk": "在水污染治理中,能 用于降解污染物的非模式底盘有盐单胞菌、不动杆 菌和需钠弧菌等", "method": "semantic_chunk"},
+     {"id": "chunk122",
614
+ "chunk": "3.5.1 盐单胞菌 盐单胞菌是革兰氏阴性需氧细菌,属于嗜盐菌 家族,常见于盐湖、沼泽及海洋等高盐环境中",
615
+ "method": "semantic_chunk"
616
+ },
617
+ {
618
+ "id": "chunk123",
619
+ "chunk": "该 菌不仅能够耐受含3%~15% NaCl 的高盐环境,而 且能在50℃的高温以及pH 为10 的高碱性环境下 存活[2, 78]",
620
+ "method": "semantic_chunk"
621
+ },
622
+ {
623
+ "id": "chunk124",
624
+ "chunk": "高盐环境也更有利于盐单胞菌降解合成 染料废水中的污染物[79]",
625
+ "method": "semantic_chunk"
626
+ },
627
+ {
628
+ "id": "chunk125",
629
+ "chunk": "由于其培养简单、生长快 速和污染抵抗性高等优势,目前盐单胞菌已被用作 底盘生产聚羟基烷酸、某些蛋白质以及小分子有机 化合物等[78, 80]",
630
+ "method": "semantic_chunk"
631
+ },
632
+ {
633
+ "id": "chunk126",
634
+ "chunk": "随着对盐单胞菌研究兴趣的增加,多株盐单胞 菌的全基因信息已被揭示,如Halomonas spp. MS1、 Halomonas 11-S5、Halomonas 25-S5、Halomonas spp. SpR1 和SpR8 等[81-83]",
635
+ "method": "semantic_chunk"
636
+ },
637
+ {
638
+ "id": "chunk127",
639
+ "chunk": "盐单胞菌株基因组测序的完 成和注释信息完善促进了学者对盐单胞菌内源基因 功能挖掘、代谢通路鉴定、代谢网络构建等方面的 研究,为构建盐单胞菌底盘提供了理论支持[78]",
640
+ "method": "semantic_chunk"
641
+ },
642
+ {
643
+ "id": "chunk128",
644
+ "chunk": "目 前,针对盐单胞菌还开发了一系列工具和元件,如 pSEVA系列质粒、pUCpHAw和pHA1AT_32等穿梭载 体、多种组成型和诱导型启动子、不依赖于Rho的固 有终止子以及一种新型的类T7表达系统等,奠定了 开展盐单胞菌使能技术的基础[78, 84-86]",
645
+ "method": "semantic_chunk"
646
+ },
647
+ {
648
+ "id": "chunk129",
649
+ "chunk": "另外,上述 在模式底盘中建立的大部分基因编辑技术,包括同 源重组、CRISPR-Cas9 和CRISPRi 等,在盐单胞菌中 425 第5 期 也同样适用[85]",
650
+ "method": "semantic_chunk"
651
+ },
652
+ {
653
+ "id": "chunk130",
654
+ "chunk": "最近,利用CRISPR-Cas9 基因编辑 技术与非同源末端连接修复系统相结合,Liu 等[80] 开发了一种快速、高效删除盐单胞菌中大片段DNA 的方法",
655
+ "method": "semantic_chunk"
656
+ },
657
+ {
658
+ "id": "chunk131",
659
+ "chunk": "以上元件和技术为盐单胞菌应用于合成生 物学底盘构建奠定了基础",
660
+ "method": "semantic_chunk"
661
+ },
662
+ {
663
+ "id": "chunk132",
664
+ "chunk": "在污染物降解方面,Ji 等[87]通过在高拷贝质粒 pSEVA241-sgRNA 上组装phaCAB 基因表达模块, 构建的合成菌株H. bluephagenesis WZY278 可以降 解食物垃圾水解物并生产聚3- 羟基丁酸酯(poly- 3-hydroxybutyrate,PHB)",
665
+ "method": "semantic_chunk"
666
+ },
667
+ {
668
+ "id": "chunk133",
669
+ "chunk": "Kleinsteuber 等[88]将携带 有2,4-D、2- 甲基-4- 氯苯氧乙酸(2-methyl-4-chloro- phenoxyacetic acid,MCPA)和3- 氯苯甲酸(3-chloro- benzoic acid)降解基因的pJP4 质粒在Halomonas sp. EF43 中表达,菌株可实现在碱性条件对2,4-D 和 3-氯苯甲酸酯的降解",
670
+ "method": "semantic_chunk"
671
+ },
672
+ {
673
+ "id": "chunk134",
674
+ "chunk": "3.5.2 拜氏不动杆菌 拜氏不动杆菌(Acinetobacter baylyi)是一种革 兰氏阴性需氧细菌,由于其具有高自然转化率以及 高重组效率,被认为是下一代合成生物学的理想底 盘[89-90]",
675
+ "method": "semantic_chunk"
676
+ },
677
+ {
678
+ "id": "chunk135",
679
+ "chunk": "A. baylyi ADP1 的无害性、代谢多样性和高 适应能力,使其已被用于各种生物技术中,如降解 有机污染物、生产各种生化产品和不同种类的生物 聚合物[89, 91]",
680
+ "method": "semantic_chunk"
681
+ },
682
+ {
683
+ "id": "chunk136",
684
+ "chunk": "随着合成生物学的发展,非模式底盘研究的深 入,A. baylyi ADP1 常被用于研究细菌遗传和代谢 机制[92]",
685
+ "method": "semantic_chunk"
686
+ },
687
+ {
688
+ "id": "chunk137",
689
+ "chunk": "可利用无痕同源重组技术和CRISPR-Cas9 系统在A. baylyi ADP1 中进行基因的插入与删除",
690
+ "method": "semantic_chunk"
691
+ },
692
+ {
693
+ "id": "chunk138",
694
+ "chunk": "同时基于已建立的CRISPR-Cas9 系统,Suárez 等[92] 开发了一个“CRISPR-Lock”技术,用以促进简单的 多组分DNA 组装和快速可靠的基因组编辑",
695
+ "method": "semantic_chunk"
696
+ },
697
+ {
698
+ "id": "chunk139",
699
+ "chunk": "目前, 一系列调控生物元件已在A. baylyi ADP1 中得到 成功应用",
700
+ "method": "semantic_chunk"
701
+ },
702
+ {
703
+ "id": "chunk140",
704
+ "chunk": "近期,Biggs 等[90]构建了一个全面的A. baylyi ADP1 基因工具集,包括合成的组成启动子和 核糖体结合位点文库",
705
+ "method": "semantic_chunk"
706
+ },
707
+ {
708
+ "id": "chunk141",
709
+ "chunk": "唐慧等[93]通过在pWH1274 质粒上插入发光基因片段luxCDABE,将重组质 粒转化至A. baylyi ADP1 中,合成的菌株可用于对 Be(II)、Ba(II)、Cu(II)、Ni(II)等急性毒物的检测",
710
+ "method": "semantic_chunk"
711
+ },
712
+ {
713
+ "id": "chunk142",
714
+ "chunk": "Huang 等[94]将luxCDABE 插入pGEM-T 质粒中,重 组的ADPWH_lux 可作为生物传感器对水杨酸盐进 行快速和特异性的检测",
715
+ "method": "semantic_chunk"
716
+ },
717
+ {
718
+ "id": "chunk143",
719
+ "chunk": "3.5.3 需钠弧菌 需钠弧菌(Vibrio natriegens)为杆状海洋细菌, pH 耐受范围为5.5~9.5,具有无致病性、生长迅速、 遗传可操作性强、可接收环境DNA 和利用多种碳 源生长的能力,被认为是合成生物学未来最有潜力 的底盘[85, 95]",
720
+ "method": "semantic_chunk"
721
+ },
722
+ {
723
+ "id": "chunk144",
724
+ "chunk": "弧菌在质粒转化和蛋白异源表达方 面表现良好,电穿孔、热休克转化和结合等方法均 可成功地将质粒DNA 转入其中,包括p15A、pMB1 和pUC 及其衍生质粒等[2, 96-97]",
725
+ "method": "semantic_chunk"
726
+ },
727
+ {
728
+ "id": "chunk145",
729
+ "chunk": "最近,多项研究表 明,在大肠杆菌中适用的生物元件和基因编辑技术 也适用于弧菌,如启动子、核糖体结合位点、转录 终止子、同源重组和CRISPR-Cas9 系统等[95, 97-98]",
730
+ "method": "semantic_chunk"
731
+ },
732
+ {
733
+ "id": "chunk146",
734
+ "chunk": "基于弧菌从环境中吸收DNA 的能力,一种自然转 化多重基因组编辑(multiplex genome editing by natu- ral transformation,MuGENT)技术已经被开发,Dalia 等[99]成功地将MuGENT 应用于弧菌中,即可一步 同时将多个基因无痕共编辑至基因组中",
735
+ "method": "semantic_chunk"
736
+ },
737
+ {
738
+ "id": "chunk147",
739
+ "chunk": "在此基 础上,Stukenberg 等[100]将其与CRISPR-Cas9 反选择 相结合开发了一种高效基因组编辑技术NT-CRIS- PR,几乎可在100%的效率下进行多数基因组修饰, 如敲除、整合和点突变",
740
+ "method": "semantic_chunk"
741
+ },
742
+ {
743
+ "id": "chunk148",
744
+ "chunk": "在污染物降解方面,Huang 等[101]利用V. natrie- gens Vmax 在盐胁迫环境下降解环境污染物,分析 并鉴定了其耐盐机制和相关的盐诱导启动子,并在 V. natriegens Vmax 菌株上构建了对CP、六溴环十二 烷(hexabromocyclododecane,HBCD)和聚对苯二甲 酸乙二醇酯(polyethylene terephthalate,PET)三种盐 的诱导降解模型,实现了对相应底物的有效降解, 最终将该菌株与甲壳素材料结合,实现了菌株的回 收利用",
745
+ "method": "semantic_chunk"
746
+ },
747
+ {
748
+ "id": "chunk149",
749
+ "chunk": "除了上述提到的重要模式细胞以及具有特殊 性状的非模式细胞外,还有许多细菌可被开发用于 水污染治理,如用于废水硝酸盐检测的蓝藻内生 物传感器[102],体内含有降解染料废水中污染物酸 性兰113 关键酶编码基因的鞘氨醇单胞菌(Sphin- gomonas)等[103]",
750
+ "method": "semantic_chunk"
751
+ },
752
+ {
753
+ "id": "chunk150",
754
+ "chunk": "对于底盘细胞及降解相关元件的 进一步挖掘与开发有助于深化研究人员设计与构 建合成微生物的进程,并推动合成微生物在水污染 治理中的应用",
755
+ "method": "semantic_chunk"
756
+ },
757
+ {
758
+ "id": "chunk151",
759
+ "chunk": "4 合成微生物在水污染治理应用的优势 4.1 高效降解力与广泛降解谱 污水不仅来源多样,而且其中污染物数量多 而复杂",
760
+ "method": "semantic_chunk"
761
+ },
762
+ {
763
+ "id": "chunk152",
764
+ "chunk": "传统的物理、化学方法通常只能对特定的 林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 激 光 生 物 学 报 426 第 33 卷 污染物发挥作用,不仅作用时间长,而且易受到环 境因素的影响,从而导致治理效果不尽如人意",
765
+ "method": "semantic_chunk"
766
+ },
767
+ {
768
+ "id": "chunk153",
769
+ "chunk": "生 物修复虽然较物理化学方法更高效,但直接将具有 较高降解力或代谢力的微生物投入污水中很可能 被环境中原有生物侵蚀,从而限制了其污水治理的 效率",
770
+ "method": "semantic_chunk"
771
+ },
772
+ {
773
+ "id": "chunk154",
774
+ "chunk": "合成生物学可根据特定目标污染物,系统地 研究微生物降解污染物过程的本质、规律和互作网 络等,深度挖掘各类污染物的相关降解元件,针对 性地构建高效稳定的合成微生物[104]",
775
+ "method": "semantic_chunk"
776
+ },
777
+ {
778
+ "id": "chunk155",
779
+ "chunk": "例如,Zhao 等[105]将γ-HCH和TCP的生物降解途径整合到P. pu- tida KT2440 中,构建的合成菌株可完全降解γ-HCH 和TCP",
780
+ "method": "semantic_chunk"
781
+ },
782
+ {
783
+ "id": "chunk156",
784
+ "chunk": "Zhang 等[106]对来源于伯克霍尔德氏菌 (Burkholderia sp.)的2,4-二硝基甲苯(2,4-dinitrotol- uene,2,4-DNT)降解途径的DntAa、DntAb 和DntAc 等8 个基因进行合成、优化,并在E. coli 上组装,所 构建的E. coli BL-4174能够完全降解2,4-DNT",
785
+ "method": "semantic_chunk"
786
+ },
787
+ {
788
+ "id": "chunk157",
789
+ "chunk": "因此,合成生物学不仅可以针对某一特定污染 物对底盘细胞进行设计和改造,还可根据治理目标 选择性地开发和利用多重降解相关元件,如转运元 件、趋化元件和抗逆元件等,通过对不同微生物来 源的元件进行设计和组合,构建具备复合代谢网络 的合成微生物,从而提升对污染物的降解能力、环 境适应能力以及拓展其污染物降解谱",
790
+ "method": "semantic_chunk"
791
+ },
792
+ {
793
+ "id": "chunk158",
794
+ "chunk": "Gong 等[107] 通过合成生物学的方法将4 个农药降解基因mpd、 pytH、mcd 和cehA 整合至P. putida KT2440 上,所得 菌株可实现对有机磷、拟除虫菊酯和氨基甲酸酯 3种农药的同时降解",
795
+ "method": "semantic_chunk"
796
+ },
797
+ {
798
+ "id": "chunk159",
799
+ "chunk": "4.2 环境友好和可持续 传统的污水治理方法可能会破坏水中的生态 系统、生物群落和生态链等",
800
+ "method": "semantic_chunk"
801
+ },
802
+ {
803
+ "id": "chunk160",
804
+ "chunk": "以合成生物学技术为 基础的合成微生物可针对污染物降解及其在环境 应用中的适应性和安全性技术开展工作",
805
+ "method": "semantic_chunk"
806
+ },
807
+ {
808
+ "id": "chunk161",
809
+ "chunk": "通过探 究微生物与微生物之间、微生物与环境之间的相互 作用机制,与生态学、计算生物学等学科结合,设 计构建高效智能的人工微生物降解体系,协调多重 元件在代谢水平上的相互作用",
810
+ "method": "semantic_chunk"
811
+ },
812
+ {
813
+ "id": "chunk162",
814
+ "chunk": "这不仅可增强合 成菌株的环境适应性,使其能够在高盐、酸碱、高 渗等极端条件下保持降解活性[108],并且有利于实 现复杂污染条件下的环境修复",
815
+ "method": "semantic_chunk"
816
+ },
817
+ {
818
+ "id": "chunk163",
819
+ "chunk": "合成微生物可通 过多重降解元件,建立能响应污染物的条件自毁基 因线路和遗传物质清除线路,使其能在自然条件 下持续主动安全的降解污染物,不产生二次污染, 具有可持续净化水环境并修复生态的特性",
820
+ "method": "semantic_chunk"
821
+ },
822
+ {
823
+ "id": "chunk164",
824
+ "chunk": "例如, Liu 等[109]将感应模块、降解模块、自杀系统整合至 E. coli TOP10 中并进行优化,所获得的菌株Pfic- TAT(2)-P100-RBS35 在无外源诱导剂的情况下,可 对水杨酸(salicylicacid,SA)进行传感并降解,最后 当环境中无SA 时菌株会自主激活自杀系统,且能 长期稳定存在于水环境中发挥功能",
825
+ "method": "semantic_chunk"
826
+ },
827
+ {
828
+ "id": "chunk165",
829
+ "chunk": "4.3 作用形式多样化 合成生物学的发展为合成微生物的多样化应 用提供了新的契机(图3)",
830
+ "method": "semantic_chunk"
831
+ },
832
+ {
833
+ "id": "chunk166",
834
+ "chunk": "为避免单一菌株的过度 工程化及增加其代谢负担,研究者们通过构建合成 微生物群落的方式,让菌群成员间通过信号交换、 检测及相互响应等方式进行交流,并通过相关基 因表达调控协调群体行为",
835
+ "method": "semantic_chunk"
836
+ },
837
+ {
838
+ "id": "chunk167",
839
+ "chunk": "合成微生物群落不仅 比传统菌群及单一微生物具有更多的复杂功能,而 且具有更高的稳定性与鲁棒性,可适用于复杂环境 和极端环境的生长及维持平稳的代谢状态[104],因 此在水环境修复中发挥着不可替代的作用",
840
+ "method": "semantic_chunk"
841
+ },
842
+ {
843
+ "id": "chunk168",
844
+ "chunk": "Zhang 等[110]构建了一个由鞘氨醇单胞菌属和假单胞菌属 微生物组成的合成微生物群落,该群落可有效地利 用菲或二苯并噻吩作为唯一的碳源生长,同时还可 代谢其他多种多环芳烃(polycyclic aromatic hydrocar- bons,PAHs)和杂环衍生物",
845
+ "method": "semantic_chunk"
846
+ },
847
+ {
848
+ "id": "chunk169",
849
+ "chunk": "在模拟废水试验中,该 合成群落5 d 内去除了100% 的PAHs,且在多次循 环后依旧保持稳定的降解力",
850
+ "method": "semantic_chunk"
851
+ },
852
+ {
853
+ "id": "chunk170",
854
+ "chunk": "Zhang 等[111]构建了 一个由类节杆菌属、根瘤菌属、红球菌属、代尔夫 特菌属和硝酸盐还原菌属微生物组成的耐盐微生 物群落,该合成群落对模拟乙酰乙酰苯胺(acetoac- etanilide,AAA)废水和实际AAA母液有良好的降解 能力,且处理后AAA母液的综合毒性显著降低",
855
+ "method": "semantic_chunk"
856
+ },
857
+ {
858
+ "id": "chunk171",
859
+ "chunk": "除了降解污染物外,合成微生物在污染物检 测方面也起着举足轻重的作用",
860
+ "method": "semantic_chunk"
861
+ },
862
+ {
863
+ "id": "chunk172",
864
+ "chunk": "运用合成生物学 的模块化和可编程性,将待测物质的感应元件(转 录因子和核糖开关等)与信号输出的报告元件(荧 光素酶、荧光蛋白和荧光适配体等)通过基因表达 调控的方式在微生物底盘上偶联[112],即可形成简 单的生物传感器,识别单一目标物质并改善信号识 别、处理和输出等传感过程",
865
+ "method": "semantic_chunk"
866
+ },
867
+ {
868
+ "id": "chunk173",
869
+ "chunk": "Hui 等[113]利用Cd(II) 感应元件与靛蓝生物合成基因簇融合在E. coli 中 成功构建了一个视觉生物传感器",
870
+ "method": "semantic_chunk"
871
+ },
872
+ {
873
+ "id": "chunk174",
874
+ "chunk": "该传感器仅对 Cd(II)有选择性响应,且适用于水环境的分析",
875
+ "method": "semantic_chunk"
876
+ },
877
+ {
878
+ "id": "chunk175",
879
+ "chunk": "将 427 第5 期 林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 表1 应用于污染物降解的合成微生物底盘细胞 Tab.1 Examples of synthetic microorganisms for water pollution application Chassis cells Methodology Application References E. coli BL221-AI Insertion of pheA1, pheA2, catA, catB, catC, catD, pcaI, pcaJ and pcaF Degradation of phenol [50] E. coli BL21-AI Insertion of tfdA, tfdB, tfdC, tfdD, tfdE, tfdF, pcaI, pcaJ and pcaF Degradation of 2, 4-D [51] E. coli BL21-AI Insertion of DntAa, DntAb, DntAc, DntAd, DntB, DntD, DntG and DntE Degradation of 2, 4-DNT [106] E. coli TOP10 Insertion of the salicylicacid (SA) detection, degradation, and suicide pathway genes Pfic‐TAT‐P1XX‐RBS Detection and degradation of SA [109] E. coli TOP10 Insertion of the Cd sensing pathway gene pPcad-ind Detection of Cd [113] E. coli TOP10 Insertion of the Hg and Cd sensory elements and the separate reporters pR-Pcad-Pmer-G Enhanced detection of Hg and Cd [114] P. putida KT2440 Insertion of mpd, pnpA, pnpB, gfp, linA, linB, linC, linD and vgb Degradation of MP and γ-HCH [58] P. putida KT2440 Deletion of GIs, insertion of the degradation pathway genes of γ-HCH and TCP Enhanced degradation of γ-HCH and TCP [59] P. putida KT2440 Deletion of hsdRMS, insertion of linA, linB, linC, linD, dhaA31, hheC and echA Enhanced degradation of γ-HCH and TCP [105] P. putida KT2440 Insertion of mpd, pytH, mcd, cehA, gfp and vgb Degradation of multiple pesticides [107] B. subtilis Design and Insertion of metal sensor modules MerRZntR Detection of heavy metal ions [61] B. subtilis ZN0871v11 Deletion of yqfY and spo0A Adsorption of organic molecules and ions [63] B. subtilis N4 Insertion of nha and ami Degradation of nitriles [67] B. subtilis 1935 Deletion of epsA~O and bslA, and expression of mtagBFP, eGFP, mCherry and TasA-MT Detection and adsorption of mul- tiple heavy metal ions [68] S. cerevisiae Insertion of CTR1 and CUP1 Detection of Cu [74] S. cerevisiae W303a Insertion of JLP1-lacZ Detection of Cd [75] S. cerevisiae BY4741 Insertion of myrGFP-MTx Adsorption of heavy metal ions (Cu, Ni and Mn) [76] S. cerevisiae EBY100 Insertion of 4R-PdBP and aga2, overexpression of aga1 Adsorption of Pd [77] H. Bluephagenesi WZY278 Overexpression of phaCABCn and linkage with ompW Recovery of food wastes [87] Halomonas sp. EF43 Insertion of the 2, 4-D, MCPA and 3-chlorobenzoate acid degradation pathway genes Degradation of 2, 4-D and 3-chlorobenzoate acid [88] Acinetobacter sp. ADP1 Insertion of luxCDABE Detection of multiple metal ions [93] Acinetobacter sp. ADP1 Insertion of luxCDABE Detection of SA [94] V. natriegens Vmax Insertion of the PET, CP and HBCDs degradation pathway genes Degradation of PET, CP and HBCDs [101] 底盘与其他功能元件或由几个相关功能元件组成 的完成某一任务的功能模块连接,还可构建复杂的 生物传感器,用于环境中存在多种污染物时的检 测",
880
+ "method": "semantic_chunk"
881
+ },
882
+ {
883
+ "id": "chunk176",
884
+ "chunk": "Hui 等[114]通过将两种荧光报告元件分别置于 Hg(II)传感元件或Cd(II)感应元件的控制下,并将 两种感应元件从不同方向整合至E. coli TOP10 中, 成功构建了一个双传感生物传感器",
885
+ "method": "semantic_chunk"
886
+ },
887
+ {
888
+ "id": "chunk177",
889
+ "chunk": "由于生物传 感器对环境的抗干扰能力通常较强,且传感器细胞 内部所有基因元件可以通过基因复制和细胞增殖 而自动扩增,因此,生物传感器在生产和环境监测 应用中都具有明显优势",
890
+ "method": "semantic_chunk"
891
+ },
892
+ {
893
+ "id": "chunk178",
894
+ "chunk": "5 总结与展望 随着人类社会的发展,以及工业、生活、农业、 医疗活动的进行,各种污染物如重金属、有毒有机 化合物和致病微生物等进入水环境,造成了本就脆 弱的生态系统的进一步恶化",
895
+ "method": "semantic_chunk"
896
+ },
897
+ {
898
+ "id": "chunk179",
899
+ "chunk": "尽管人们对污水治 理的关注日益增加,但现阶段的污水实际治理仍然 激 光 生 物 学 报 428 第 33 卷 缺乏科学性和全面性,很难根据污水的具体情况制 定科学的治理方案,且没有完善的污水排放管理体 系,在一定程度上加大了水污染治理工作的难度",
900
+ "method": "semantic_chunk"
901
+ },
902
+ {
903
+ "id": "chunk180",
904
+ "chunk": "合成生物学将工程、科学和技术结合应用于编 辑生物体的遗传信息,使其能够执行新的功能",
905
+ "method": "semantic_chunk"
906
+ },
907
+ {
908
+ "id": "chunk181",
909
+ "chunk": "应 用合成生物学开发新兴污染物降解菌株,已成为 合成生物学和微生物学领域的研究前沿",
910
+ "method": "semantic_chunk"
911
+ },
912
+ {
913
+ "id": "chunk182",
914
+ "chunk": "不同于 以从环境中筛选能够代谢特定污染物的天然微生 物为重点的传统环境生物技术,合成微生物是通过 在底盘细胞中插入天然或非天然功能元件、构建底 盘中原本不存在的合成代谢途径,形成全新的可感 知、降解污染物的微生物",
915
+ "method": "semantic_chunk"
916
+ },
917
+ {
918
+ "id": "chunk183",
919
+ "chunk": "研究者们可根据目的定 向改造微生物,���其具有降解目标污染物、提高降 解效率和增强环境适应性等能力",
920
+ "method": "semantic_chunk"
921
+ },
922
+ {
923
+ "id": "chunk184",
924
+ "chunk": "近年来,国内外 研究人员开展了各种有关合成微生物的鉴定、筛 选、关键元件挖掘和改造等方面的工作,目前已获 得众多重要突破,如开发了由多种微生物参与的污 染物传感器,构建了合成微生物降解群落以及复合 材料等[115-117]",
925
+ "method": "semantic_chunk"
926
+ },
927
+ {
928
+ "id": "chunk185",
929
+ "chunk": "因此,合成微生物在环境修复领域中 的地位愈发重要",
930
+ "method": "semantic_chunk"
931
+ },
932
+ {
933
+ "id": "chunk186",
934
+ "chunk": "但目前合成微生物的具体实施仍存在一些挑 战",
935
+ "method": "semantic_chunk"
936
+ },
937
+ {
938
+ "id": "chunk187",
939
+ "chunk": "例如:1)污水情况复杂、难以估计,多种污染 物分解代谢途径和关键降解酶编码基因不清晰",
940
+ "method": "semantic_chunk"
941
+ },
942
+ {
943
+ "id": "chunk188",
944
+ "chunk": "2)微生物具有物种和基因多样性、生态网络复杂 性,使其难以标准化,无法确保搭建的功能元件和 模块在不同系统中可拥有同样高效的工作方式和 结果",
945
+ "method": "semantic_chunk"
946
+ },
947
+ {
948
+ "id": "chunk189",
949
+ "chunk": "3)在特定底盘中加载和运行的合成DNA 数 量有明显限制,添加合成生物元件无疑会给底盘带 来非自然的额外负担,产生生长速度减慢、基因变 异等副作用",
950
+ "method": "semantic_chunk"
951
+ },
952
+ {
953
+ "id": "chunk190",
954
+ "chunk": "4)目前,合成微生物的构建及应用研 究多处于实验室研究阶段,多采用模拟污水进行, 要想运用合成微生物在实际污水环境中,需要进行 更多的现场试验以确保生物安全和效率",
955
+ "method": "semantic_chunk"
956
+ },
957
+ {
958
+ "id": "chunk191",
959
+ "chunk": "尽管合 成微生物在污水治理应用方面面临着诸多挑战,但 是,随着合成生物学、系统生物学和基因工程等学 科的发展,学者们相信,在可预见的未来,在更多理 论和实践的支持下,合成微生物具有治理水污染、 解决环境问题的巨大潜力",
960
+ "method": "semantic_chunk"
961
+ },
962
+ {
963
+ "id": "chunk192",
964
+ "chunk": "参考文献(References): [1] JANET JOSHIBA G, SENTHIL KUMAR P, CHRISTOPHER F C, et al. Insights of CMNPs in water pollution control [J]. IET Nanobiotechnology, 2019, 13(6): 553-559. [2] XIANG L, LI G, WEN L, et al. Biodegradation of aromatic pollut- ants meets synthetic biology [J]. Synthetic and Systems Biotech- nology, 2021, 6(3): 153-162. [3] TRIPATHI M, SINGH S, PATHAK S, et al. Recent strategies for the remediation of textile dyes from wastewater: a systematic re- view [J]. Toxics, 2023, 11(11): 940. [4] ZHANG Y, YU H, ZHAI R, et al. Recent progress in photocata- lytic degradation of water pollution by bismuth tungstate [J]. Molecules, 2023, 28(24): 8011. [5] NAGDA A, MEENA M, SHAH M P. Bioremediation of industrial effluents: a synergistic approach [J]. Journal of Basic Microbiol- ogy, 2022, 62(3/4): 395-414. [6] DUTTA D, ARYA S, KUMAR S. Industrial wastewater treatment: current trends, bottlenecks, and best practices [J]. Chemosphere, 2021, 285: 131245. [7] MAO G, HAN Y, LIU X, et al. Technology status and trends of in- dustrial wastewater treatment: a patent analysis [J]. Chemosphere, 2022, 288(Pt 2): 132483. [8] PRABAKAR D, SUVETHA K S, MANIMUDI V T, et al. Pre- treatment technologies for industrial effluents: critical review on bioenergy production and environmental concerns [J]. Journal of Environmental Management, 2018, 218: 165-180. [9] RAJ A, DUBEY A, MALLA M A, et al. Pesticide pestilence: global scenario and recent advances in detection and degradation methods [J]. Journal of Environmental Management, 2023, 338: 117680. [10] TUDI M, DANIEL RUAN H, WANG L, et al. Agriculture devel- opment, pesticide application and its impact on the environment [J]. International Journal of Environmental Research and Public Health, 2021, 18(3): 1112. [11] SYAFRUDIN M, KRISTANTI R A, YUNIARTO A, et al. Pes- ticides in drinking water: a review [J]. International Journal of Environmental Research and Public Health, 2021, 18(2): 468. [12] BOONUPARA T, UDOMKUN P, KHAN E, et al. Airborne pes- ticides from agricultural practices: a critical review of pathways, influencing factors, and human health implications [J]. Toxics, 2023, 11(10): 858. [13] MACKULAK T, CVERENKÁROVÁ K, VOJS STAŇOVÁ A, et al. Hospital wastewater-source of specific micropollutants, antibiotic-resistant microorganisms, viruses, and their elimination [J]. Antibiotics (Basel), 2021, 10(9): 1070. [14] DEKKER H M, STROOMBERG G J, PROKOP M. Tackling the increasing contamination of the water supply by iodinated contrast media [J]. Insights into Imaging, 2022, 13(1): 30. [15] TANG Y, ZHAO S, PENG Z, et al. Cu2O nanoparticles anchored on carbon for the efficient removal of propofol from operating room wastewater via peroxymonosulfate activation: efficiency, mechanism, and pathway [J]. Royal Society of Chemistry, 2021, 11(34): 20983-20991. [16] BALAKRISHNAN A, JACOB M M, SENTHIL KUMAR P, et al. Strategies for safe management of hospital wastewater during the COVID-19 pandemic [J]. International Journal of Environmental Science and Technology: IJEST, 2023, 20(12): 1-16. [17] RODRÍGUEZ-SERIN H, GAMEZ-JARA A, DE LA CRUZ- NORIEGA M, et al. Literature review: evaluation of drug removal techniques in municipal and hospital wastewater [J]. International Journal of Environmental Research and Public Health, 2022, 19(20): 13105. [18] PARIENTE M I, SEGURA Y, ÁLVAREZ-TORRELLAS S, et al. Critical review of technologies for the on-site treatment of hospital wastewater: from conventional to combined advanced processes [J]. 
Journal of Environmental Management, 2022, 320: 115769. [19] WIDYARANI, WULAN D R, HAMIDAH U, et al. Domestic 429 第5 期 wastewater in Indonesia: generation, characteristics and treatment [J]. Environmental Science and Pollution Research International, 2022, 29(22): 32397-32414. [20] BAJPAI M, KATOCH S S, CHATURVEDI N K. Comparative study on decentralized treatment technologies for sewage and graywater reuse: a review [J]. Water Science and Technology, 2019, 80(11): 2091-2106. [21] ZHANG W, CHU H, YANG L, et al. Technologies for pollutant removal and resource recovery from blackwater: a review [J]. Frontiers of Environmental Science & Engineering, 2023, 17(7): 83. [22] KHAJVAND M, MOSTAFAZADEH A K, DROGUI P, et al. Grey- water characteristics, impacts, treatment, and reclamation using adsorption processes towards the circular economy [J]. Environ- mental Science and Pollution Research International, 2022, 29(8): 10966-11003. [23] RASHID R, SHAFIQ I, AKHTER P, et al. A state-of-the-art review on wastewater treatment techniques: the effectiveness of adsorption method [J]. Environmental Science and Pollution Re- search International, 2021, 28(8): 1-17. [24] KIM S, NAM S N, JANG A, et al. Review of adsorption-mem- brane hybrid systems for water and wastewater treatment [J]. Chemosphere, 2022, 286(Pt 3): 131916. [25] KARIMI-MALEH H, SHAFIEIZADEH M, TAHER M A, et al. The role of magnetite/graphene oxide nano-composite as a high- efficiency adsorbent for removal of phenazopyridine residues from water samples, an experimental/theoretical investigation [J]. Jour- nal of Molecular Liquids, 2020, 298: 112040. [26] SPONZA D T, ALICANOGLU P. Reuse and recovery of raw hospital wastewater containing ofloxacin after photocatalytic treat- ment with nano graphene oxide magnetite [J]. Water Science and Technology, 2018, 77(1/2): 304-322. [27] AZUMA T, KATAGIRI M, SEKIZUKA T, et al. Inactivation of bacteria and residual antimicrobials in hospital wastewater by ozone treatment [J]. Antibiotics, 2022, 11(7): 862. [28] TAOUFIK N, BOUMYA W, ACHAK M, et al. Comparative over- view of advanced oxidation processes and biological approaches for the removal pharmaceuticals [J]. Journal of Environmental Management, 2021, 288: 112404. [29] GARRIDO-CARDENAS J A, ESTEBAN-GARCÍA B, AGÜERA A, et al. Wastewater treatment by advanced oxidation process and their worldwide research trends [J]. International Journal of Envi- ronmental Research and Public Health, 2019, 17(1): 170. [30] YANG W, LIU G, CHEN Y, et al. Persulfate enhanced electro- chemical oxidation of highly toxic cyanide-containing organic wastewater using boron-doped diamond anode [J]. Chemosphere, 2020, 252: 126499. [31] SAEED M U, HUSSAIN N, SUMRIN A, et al. Microbial biore- mediation strategies with wastewater treatment potentialities: a re- view [J]. Science of the Total Environment, 2022, 818: 151754. [32] SHARMA M, AGARWAL S, AGARWAL MALIK R, et al. Recent advances in microbial engineering approaches for wastewater treatment: a review [J]. Bioengineered, 2023, 14(1): 2184518. [33] PATEL H, YADAV V K, YADAV K K, et al. A recent and systemic approach towards microbial biodegradation of dyes from textile industries [J]. Water, 2022, 14(19): 3163. [34] MANTOVANI M, ROSSI S, FICARA E, et al. Removal of phar- maceutical compounds from the liquid phase of anaerobic sludge in a pilot-scale high-rate algae-bacteria pond [J]. Science of the Total Environment, 2024, 908: 167881. [35] CERETTA M B, NERCESSIAN D, WOLSKI E A. 
Current trends on role of biological treatment in integrated treatment technologies of textile wastewater [J]. Frontiers in Microbiology, 2021, 12: 651025. [36] AMINIAN-DEHKORDI J, RAHIMI S, GOLZAR-AHMADI M, et al. Synthetic biology tools for environmental protection [J]. Biotechnology Advances, 2023, 68: 108239. [37] CHAKRABORTY R, WU C H, HAZEN T C. Systems biology ap- proach to bioremediation [J]. Current Opinion in Biotechnology, 2012, 23(3): 483-490. [38] LIU Y, SU A, LI J, et al. Towards next-generation model microor- ganism chassis for biomanufacturing [J]. Applied Microbiology and Biotechnology, 2020, 104(21): 1-14. [39] DE LORENZO V, KRASNOGOR N, SCHMIDT M. For the sake of the bioeconomy: define what a synthetic biology chassis is",
965
+ "method": "semantic_chunk"
966
+ },
967
+ {
968
+ "id": "chunk193",
969
+ "chunk": "[J]. New Biotechnology, 2021, 60: 44-51. [40] GARNER K L. Principles of synthetic biology [J]. Essays in Bio- chemistry, 2021, 65(5): 791-811. [41] SRIDHAR S, AJO-FRANKLIN C M, MASIELLO C A. A frame- work for the systematic selection of biosensor chassis for envi- ronmental synthetic biology [J]. ACS Synthetic Biology, 2022, 11(9): 2909-2916. [42] AHANKOUB M, MARDANI G, GHASEMI-DEHKORDI P, et al. Biodecomposition of phenanthrene and pyrene by a genetically en- gineered Escherichia coli [J]. Recent Patents on Biotechnology, 2020, 14(2): 121-133. [43] TONG C, LIANG Y, ZHANG Z, et al. Review of knockout tech- nology approaches in bacterial drug resistance research [J]. PeerJ, 2023, 11: e15790. [44] YU M, HU S, TANG B, et al. Engineering Escherichia coli Nissle 1917 as a microbial chassis for therapeutic and industrial applications [J]. Biotechnology Advances, 2023, 67: 108202. [45] PU W, CHEN J, ZHOU Y, et al. Systems metabolic engineering of Escherichia coli for hyper-production of 5-aminolevulinic acid [J]. Biotechnology for Biofuels and Bioproducts, 2023, 16(1): 31. [46] GE J, WANG X, BAI Y, et al. Engineering Escherichia coli for efficient assembly of heme proteins [J]. Microbial Cell Factories, 2023, 22(1): 59. [47] LI Q, SUN B, CHEN J, et al. A modified pCas/pTargetF system for CRISPR-Cas9-assisted genome editing in Escherichia coli [J]. Acta Biochim Biophys Sin (Shanghai), 2021, 53(5): 620-627. [48] MORI H, KATAOKA M, YANG X. Past, present, and future of genome modification in Escherichia coli [J]. Microorganisms, 2022, 10(9): 1835. [49] ZHU L, LIANG Z, XU Y, et al. Ultrasensitive and rapid visual detection of Escherichia coli O157:H7 based on RAA-CRISPR/ Cas12a system [J]. Biosensors, 2023, 13(6): 659. [50] WANG B, XU J, GAO J, et al. Construction of an Escherichia coli strain to degrade phenol completely with two modified meta- bolic modules [J]. Journal of Hazardous Materials, 2019, 373: 29-38. [51] WANG Y, TIAN Y S, GAO J J, et al. Complete biodegradation of the oldest organic herbicide 2,4-dichlorophenoxyacetic acid by engineering Escherichia coli [J]. Journal of Hazardous Materials, 2023, 451: 131099. [52] LV X, LI Y, XIU X, et al. CRISPR genetic toolkits of classical food microorganisms: current state and future prospects [J]. Bio- technology Advances, 2023, 69: 108261. [53] WEIMER A, KOHLSTEDT M, VOLKE D C, et al. Industrial biotechnology of Pseudomonas putida: advances and prospects [J]. Applied Microbiology and Biotechnology, 2020, 104(18): 7745-7766. [54] SUN J, WANG Q, JIANG Y, et al. Genome editing and transcrip- tional repression in Pseudomonas putida KT2440 via the type II CRISPR system [J]. Microbial Cell Factories, 2018, 17(1): 41. 林雅倩等:合成微生物与水污染治理应用 林雅倩等:合成微生物与水污染治理应用 激 光 生 物 学 报 430 第 33 卷 [55] MARTÍNEZ-GARCÍA E, DE LORENZO V. Pseudomonas putida as a synthetic biology chassis and a metabolic engineering plat- form [J]. Current Opinion in Biotechnology, 2024, 85: 103025. [56] APARICIO T, DE LORENZO V, MARTÍNEZ-GARCÍA E. CRIS- PR/Cas9-enhanced ssDNA recombineering for Pseudomonas putida [J]. Microbial Biotechnology, 2019, 12(5): 1076-1089. [57] LIANG T, SUN J, JU S, et al. Construction of T7-like expression system in Pseudomonas putida KT2440 to enhance the heter- ologous expression level [J]. Frontiers in Chemistry, 2021, 9: 664967. [58] GONG T, LIU R, ZUO Z, et al. Metabolic engineering of Pseu- domonas putida KT2440 for complete mineralization of methyl parathion and γ-Hexachlorocyclohexane [J]. ACS Synthetic Biol- ogy, 2016, 5(5): 434-442. 
[59] LIANG P, ZHANG Y, XU B, et al. Deletion of genomic islands in the Pseudomonas putida KT2440 genome can create an optimal chassis for synthetic biology applications [J]. Microbial Cell Fac- tories, 2020, 19(1): 70. [60] FERRANDO J, FILLUELO O, ZEIGLER D R, et al. Barriers to simultaneous multilocus integration in Bacillus subtilis tumble down: development of a straightforward screening method for the colorimetric detection of one-step multiple gene insertion using the CRISPR-Cas9 system [J]. Microbial Cell Factories, 2023, 22(1): 21. [61] GHATAORA J S, GEBHARD S, REEKSTING B J. Chimeric MerR-Family regulators and logic elements for the design of metal sensitive genetic circuits in Bacillus subtilis [J]. ACS Synthetic Biology, 2023, 12(3): 735-749. [62] SU Y, LIU C, FANG H, et al. Bacillus subtilis: a universal cell factory for industry, agriculture, biomaterials and medicine [J]. Microbial Cell Factories, 2020, 19(1): 173. [63] ZHU W, LIU Y, CAO X, et al. Recovering organic matters and ions from wastewater by genetically engineered Bacillus subtilis biomass [J]. Journal of Environmental Management, 2015, 161: 402-407. [64] QIN Y, ANGELINI L L, CHAI Y. Bacillus subtilis cell differ- entiation, biofilm formation and environmental prevalence [J]. Microorganisms, 2022, 10(6): 1108. [65] MOHSIN M Z, OMER R, HUANG J, et al. Advances in engi- neered Bacillus subtilis biofilms and spores, and their applications in bioremediation, biocatalysis, and biomaterials [J]. Synthetic and Systems Biotechnology, 2021, 6(3): 180-191. [66] SHI Y, CHEN T, SHAW P, et al. Manipulating bacterial biofilms using materiobiology and synthetic biology approaches [J]. Fron- tiers in Microbiology, 2022, 13: 844997. [67] LI C, SUN Y, YUE Z, et al. Combination of a recombinant bacte- rium with organonitrile-degrading and biofilm-forming capability and a positively charged carrier for organonitriles removal [J]. Journal of Hazardous Materials, 2018, 353: 372-380. [68] ZHU X, XIANG Q, CHEN L, et al. Engineered Bacillus subtilis Biofilm@Biochar living materials for in-situ sensing and biore- mediation of heavy metal ions pollution [J]. Journal of Hazardous Materials, 2024, 465: 133119. [69] WU Y, FENG S, SUN Z, et al. An outlook to sophisticated tech- nologies and novel developments for metabolic regulation in the Saccharomyces cerevisiae expression system [J]. Frontiers in Bioengineering and Biotechnology, 2023, 11: 1249841. [70] TANG H, WU Y, DENG J, et al. Promoter architecture and pro- moter engineering in Saccharomyces cerevisiae [J]. Metabolites, 2020, 10(8): 320. [71] PARAPOULI M, VASILEIADIS A, AFENDRA A S, et al. Sac- charomyces cerevisiae and its industrial applications [J]. AIMS Microbiology, 2020, 6(1): 1-31. [72] CHEN B, LEE H L, HENG Y C, et al. Synthetic biology toolkits and applications in Saccharomyces cerevisiae [J]. Biotechnology Advances, 2018, 36(7): 1870-1881. [73] SATO G, KURODA K. Overcoming the limitations of CRISPR- Cas9 systems in Saccharomyces cerevisiae: off-target effects, epigenome, and mitochondrial editing [J]. Microorganisms, 2023, 11(4): 1040. [74] FAN C, ZHANG D, MO Q, et al. Engineering Saccharomyces cerevisiae-based biosensors for copper detection [J]. Microbial Biotechnology, 2022, 15(11): 2854-2860. [75] ITO-HARASHIMA S, MIZUTANI Y, NISHIMURA M, et al. A pilot study for construction of a new cadmium-sensing yeast strain carrying a reporter plasmid with the JLP1 promoter [J]. The Jour- nal of Toxicological Sciences, 2017, 42(1): 103-109. [76] RUTA L L, LIN Y F, KISSEN R, et al. 
Anchoring plant metallo- thioneins to the inner face of the plasma membrane of Saccha- romyces cerevisiae cells leads to heavy metal accumulation [J]. PLoS One, 2017, 12(5): e0178393. [77] MASHANGOANE B F, CHIRWA E N. Cell surface display of pal- ladium binding peptide on Saccharomyces cerevisiae EBY100 cells using the a-agglutinin anchor system developed for the bio- sorption of Pd (II) [J]. Minerals Engineering, 2022, 176: 107325. [78] YE J, CHEN G. Halomonas as a chassis [J]. Essays in Biochem- istry, 2021, 65(2): 393-403. [79] AMINI B, OTADI M, PARTOVINIA A. Statistical modeling and optimization of toluidine red biodegradation in a synthetic waste- water using Halomonas strain Gb [J]. Journal of Environmental Health Science & Engineering, 2019, 17(1): 319-330. [80] LIU C, YUE Y, XUE Y, et al. CRISPR-Cas9 assisted non- homologous end joining genome editing system of Halomonas bluephagenesis for large DNA f",
970
+ "method": "semantic_chunk"
971
+ }
972
+ ]
output_files/semantic_chunk_vector.json ADDED
The diff for this file is too large to render. See raw diff
 
rag.py ADDED
@@ -0,0 +1,2065 @@
1
+ import torch
2
+ import faiss
3
+ import numpy as np
4
+ from llama_index.core.node_parser import SentenceSplitter
5
+ import re
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ import json
9
+ import shutil
10
+ from typing import Optional
11
+ from openai import OpenAI
12
+ import gradio as gr
13
+ import os
14
+ import fitz # PyMuPDF
15
+ import chardet # 用于自动检测编码
16
+ import traceback
17
+ from config import Config # 导入配置文件
18
+
19
+ # 创建知识库根目录和临时文件目录
20
+ KB_BASE_DIR = Config.kb_base_dir
21
+ os.makedirs(KB_BASE_DIR, exist_ok=True)
22
+
23
+ # 创建默认知识库目录
24
+ DEFAULT_KB = Config.default_kb
25
+ DEFAULT_KB_DIR = os.path.join(KB_BASE_DIR, DEFAULT_KB)
26
+ os.makedirs(DEFAULT_KB_DIR, exist_ok=True)
27
+
28
+ # 创建临时输出目录
29
+ OUTPUT_DIR = Config.output_dir
30
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
31
+
32
+ client = OpenAI(
33
+ api_key=Config.llm_api_key,
34
+ base_url=Config.llm_base_url
35
+ )
36
+
37
+ class DeepSeekClient:
38
+ def generate_answer(self, system_prompt, user_prompt, model=Config.llm_model):
39
+ response = client.chat.completions.create(
40
+ model=model,
41
+ messages=[
42
+ {"role": "system", "content": system_prompt},
43
+ {"role": "user", "content": user_prompt}
44
+ ],
45
+ stream=False
46
+ )
47
+ return response.choices[0].message.content.strip()
48
+
49
+ # 获取知识库列表
50
+ def get_knowledge_bases() -> List[str]:
51
+ """获取所有知识库名称"""
52
+ try:
53
+ if not os.path.exists(KB_BASE_DIR):
54
+ os.makedirs(KB_BASE_DIR, exist_ok=True)
55
+
56
+ kb_dirs = [d for d in os.listdir(KB_BASE_DIR)
57
+ if os.path.isdir(os.path.join(KB_BASE_DIR, d))]
58
+
59
+ # 确保默认知识库存在
60
+ if DEFAULT_KB not in kb_dirs:
61
+ os.makedirs(os.path.join(KB_BASE_DIR, DEFAULT_KB), exist_ok=True)
62
+ kb_dirs.append(DEFAULT_KB)
63
+
64
+ return sorted(kb_dirs)
65
+ except Exception as e:
66
+ print(f"获取知识库列表失败: {str(e)}")
67
+ return [DEFAULT_KB]
68
+
69
+ # 创建新知识库
70
+ def create_knowledge_base(kb_name: str) -> str:
71
+ """创建新的知识库"""
72
+ try:
73
+ if not kb_name or not kb_name.strip():
74
+ return "错误:知识库名称不能为空"
75
+
76
+ # 净化知识库名称,只允许字母、数字、下划线和中文
77
+ kb_name = re.sub(r'[^\w\u4e00-\u9fff]', '_', kb_name.strip())
78
+
79
+ kb_path = os.path.join(KB_BASE_DIR, kb_name)
80
+ if os.path.exists(kb_path):
81
+ return f"知识库 '{kb_name}' 已存在"
82
+
83
+ os.makedirs(kb_path, exist_ok=True)
84
+ return f"知识库 '{kb_name}' 创建成功"
85
+ except Exception as e:
86
+ return f"创建知识库失败: {str(e)}"
87
+
88
+ # 删除知识库
89
+ def delete_knowledge_base(kb_name: str) -> str:
90
+ """删除指定的知识库"""
91
+ try:
92
+ if kb_name == DEFAULT_KB:
93
+ return f"无法删除默认知识库 '{DEFAULT_KB}'"
94
+
95
+ kb_path = os.path.join(KB_BASE_DIR, kb_name)
96
+ if not os.path.exists(kb_path):
97
+ return f"知识库 '{kb_name}' 不存在"
98
+
99
+ shutil.rmtree(kb_path)
100
+ return f"知识库 '{kb_name}' 已删除"
101
+ except Exception as e:
102
+ return f"删除知识库失败: {str(e)}"
103
+
104
+ # 获取知识库文件列表
105
+ def get_kb_files(kb_name: str) -> List[str]:
106
+ """获取指定知识库中的文件列表"""
107
+ try:
108
+ kb_path = os.path.join(KB_BASE_DIR, kb_name)
109
+ if not os.path.exists(kb_path):
110
+ return []
111
+
112
+ # 获取所有文件(排除索引文件和元数据文件)
113
+ files = [f for f in os.listdir(kb_path)
114
+ if os.path.isfile(os.path.join(kb_path, f)) and
115
+ not f.endswith(('.index', '.json'))]
116
+
117
+ return sorted(files)
118
+ except Exception as e:
119
+ print(f"获取知识库文件列表失败: {str(e)}")
120
+ return []
121
+
122
+ # 语义分块函数
123
+ def semantic_chunk(text: str, chunk_size=800, chunk_overlap=20) -> List[dict]:
124
+ class EnhancedSentenceSplitter(SentenceSplitter):
125
+ def __init__(self, *args, **kwargs):
126
+ custom_seps = [";", "!", "?", "\n"]
127
+ separators = [kwargs.get("separator", "。")] + custom_seps
128
+ kwargs["separator"] = '|'.join(map(re.escape, separators))
129
+ super().__init__(*args, **kwargs)
130
+
131
+ def _split_text(self, text: str, **kwargs) -> List[str]:
132
+ splits = re.split(f'({self.separator})', text)
133
+ chunks = []
134
+ current_chunk = []
135
+ for part in splits:
136
+ part = part.strip()
137
+ if not part:
138
+ continue
139
+ if re.fullmatch(self.separator, part):
140
+ if current_chunk:
141
+ chunks.append("".join(current_chunk))
142
+ current_chunk = []
143
+ else:
144
+ current_chunk.append(part)
145
+ if current_chunk:
146
+ chunks.append("".join(current_chunk))
147
+ return [chunk.strip() for chunk in chunks if chunk.strip()]
148
+
149
+ text_splitter = EnhancedSentenceSplitter(
150
+ separator="。",
151
+ chunk_size=chunk_size,
152
+ chunk_overlap=chunk_overlap,
153
+ paragraph_separator="\n\n"
154
+ )
155
+
156
+ paragraphs = []
157
+ current_para = []
158
+ current_len = 0
159
+
160
+ for para in text.split("\n\n"):
161
+ para = para.strip()
162
+ para_len = len(para)
163
+ if para_len == 0:
164
+ continue
165
+ if current_len + para_len <= chunk_size:
166
+ current_para.append(para)
167
+ current_len += para_len
168
+ else:
169
+ if current_para:
170
+ paragraphs.append("\n".join(current_para))
171
+ current_para = [para]
172
+ current_len = para_len
173
+
174
+ if current_para:
175
+ paragraphs.append("\n".join(current_para))
176
+
177
+ chunk_data_list = []
178
+ chunk_id = 0
179
+ for para in paragraphs:
180
+ chunks = text_splitter.split_text(para)
181
+ for chunk in chunks:
182
+ if len(chunk) < 20:
183
+ continue
184
+ chunk_data_list.append({
185
+ "id": f'chunk{chunk_id}',
186
+ "chunk": chunk,
187
+ "method": "semantic_chunk"
188
+ })
189
+ chunk_id += 1
190
+ return chunk_data_list
191
+
192
+ # 构建Faiss索引
193
+ def build_faiss_index(vector_file, index_path, metadata_path):
194
+ try:
195
+ with open(vector_file, 'r', encoding='utf-8') as f:
196
+ data = json.load(f)
197
+
198
+ if not data:
199
+ raise ValueError("向量数据为空,请检查输入文件。")
200
+
201
+ # 确认所有数据项都有向量
202
+ valid_data = []
203
+ for item in data:
204
+ if 'vector' in item and item['vector']:
205
+ valid_data.append(item)
206
+ else:
207
+ print(f"警告: 跳过没有向量的数据项 ID: {item.get('id', '未知')}")
208
+
209
+ if not valid_data:
210
+ raise ValueError("没有找到任何有效的向量数据。")
211
+
212
+ # 提取向量
213
+ vectors = [item['vector'] for item in valid_data]
214
+ vectors = np.array(vectors, dtype=np.float32)
215
+
216
+ if vectors.size == 0:
217
+ raise ValueError("向量数组为空,转换失败。")
218
+
219
+ # 检查向量维度
220
+ dim = vectors.shape[1]
221
+ n_vectors = vectors.shape[0]
222
+ print(f"构建索引: {n_vectors} 个向量,每个向量维度: {dim}")
223
+
224
+ # 确定索引类型和参数
225
+ max_nlist = n_vectors // 39
226
+ nlist = min(max_nlist, 128) if max_nlist >= 1 else 1
227
+
228
+ if nlist >= 1 and n_vectors >= nlist * 39:
229
+ print(f"使用 IndexIVFFlat 索引,nlist={nlist}")
230
+ quantizer = faiss.IndexFlatIP(dim)
231
+ index = faiss.IndexIVFFlat(quantizer, dim, nlist)
232
+ if not index.is_trained:
233
+ index.train(vectors)
234
+ index.add(vectors)
235
+ else:
236
+ print(f"使用 IndexFlatIP 索引")
237
+ index = faiss.IndexFlatIP(dim)
238
+ index.add(vectors)
239
+
240
+ faiss.write_index(index, index_path)
241
+ print(f"成功写入索引到 {index_path}")
242
+
243
+ # 创建元数据
244
+ metadata = [{'id': item['id'], 'chunk': item['chunk'], 'method': item['method']} for item in valid_data]
245
+ with open(metadata_path, 'w', encoding='utf-8') as f:
246
+ json.dump(metadata, f, ensure_ascii=False, indent=4)
247
+ print(f"成功写入元数据到 {metadata_path}")
248
+
249
+ return True
250
+ except Exception as e:
251
+ print(f"构建索引失败: {str(e)}")
252
+ traceback.print_exc()
253
+ raise
254
+
255
+ # 向量化文件内容
256
+ def vectorize_file(data_list, output_file_path, field_name="chunk"):
257
+ """向量化文件内容,处理长度限制并确保输入有效"""
258
+ if not data_list:
259
+ print("警告: 没有数据需要向量化")
260
+ with open(output_file_path, 'w', encoding='utf-8') as outfile:
261
+ json.dump([], outfile, ensure_ascii=False, indent=4)
262
+ return
263
+
264
+ # 准备查询文本,确保每个文本有效且长度适中
265
+ valid_data = []
266
+ valid_texts = []
267
+
268
+ for data in data_list:
269
+ text = data.get(field_name, "")
270
+ # 确保文本不为空且长度合适
271
+ if text and 1 <= len(text) <= 8000: # 略小于API限制的8192,留出一些余量
272
+ valid_data.append(data)
273
+ valid_texts.append(text)
274
+ else:
275
+ # 如果文本太长,截断它
276
+ if len(text) > 8000:
277
+ truncated_text = text[:8000]
278
+ print(f"警告: 文本过长,已截断至8000字符。原始长度: {len(text)}")
279
+ data[field_name] = truncated_text
280
+ valid_data.append(data)
281
+ valid_texts.append(truncated_text)
282
+ else:
283
+ print(f"警告: 跳过空文本或长度为0的文本")
284
+
285
+ if not valid_texts:
286
+ print("错误: 所有文本都无效,无法进行向量化")
287
+ with open(output_file_path, 'w', encoding='utf-8') as outfile:
288
+ json.dump([], outfile, ensure_ascii=False, indent=4)
289
+ return
290
+
291
+ # 向量化有效文本
292
+ vectors = vectorize_query(valid_texts)
293
+
294
+ # 检查向量化是否成功
295
+ if vectors.size == 0 or len(vectors) != len(valid_data):
296
+ print(f"错误: 向量化失败或向量数量({len(vectors) if vectors.size > 0 else 0})与数据条目({len(valid_data)})不匹配")
297
+ # 保存原始数据,但不含向量
298
+ with open(output_file_path, 'w', encoding='utf-8') as outfile:
299
+ json.dump(valid_data, outfile, ensure_ascii=False, indent=4)
300
+ return
301
+
302
+ # 添加向量到数据中
303
+ for data, vector in zip(valid_data, vectors):
304
+ data['vector'] = vector.tolist()
305
+
306
+ # 保存结果
307
+ with open(output_file_path, 'w', encoding='utf-8') as outfile:
308
+ json.dump(valid_data, outfile, ensure_ascii=False, indent=4)
309
+
310
+ print(f"成功向量化 {len(valid_data)} 条数据并保存到 {output_file_path}")
311
+
312
+
313
+ # 向量化查询 - 通用函数,被多处使用
314
+ def vectorize_query(query, model_name=Config.model_name, batch_size=Config.batch_size) -> np.ndarray:
315
+ """向量化文本查询,返回嵌入向量,改进错误处理和批处理"""
316
+ embedding_client = OpenAI(
317
+ api_key=Config.api_key,
318
+ base_url=Config.base_url
319
+ )
320
+
321
+ if not query:
322
+ print("警告: 传入向量化的查询为空")
323
+ return np.array([])
324
+
325
+ if isinstance(query, str):
326
+ query = [query]
327
+
328
+ # 验证所有查询文本,确保它们符合API要求
329
+ valid_queries = []
330
+ for q in query:
331
+ if not q or not isinstance(q, str):
332
+ print(f"警告: 跳过无效查询: {type(q)}")
333
+ continue
334
+
335
+ # 清理文本并检查长度
336
+ clean_q = clean_text(q)
337
+ if not clean_q:
338
+ print("警告: 清理后的查询文本为空")
339
+ continue
340
+
341
+ # 检查长度是否在API限制范围内
342
+ if len(clean_q) > 8000:
343
+ print(f"警告: 查询文本过长 ({len(clean_q)} 字符),截断至 8000 字符")
344
+ clean_q = clean_q[:8000]
345
+
346
+ valid_queries.append(clean_q)
347
+
348
+ if not valid_queries:
349
+ print("错误: 所有查询都无效,无法进行向量化")
350
+ return np.array([])
351
+
352
+ # 分批处理有效查询
353
+ all_vectors = []
354
+ for i in range(0, len(valid_queries), batch_size):
355
+ batch = valid_queries[i:i + batch_size]
356
+ try:
357
+ # 记录批次信息便于调试
358
+ print(f"正在向量化批次 {i//batch_size + 1}/{(len(valid_queries)-1)//batch_size + 1}, "
359
+ f"包含 {len(batch)} 个文本,第一个文本长度: {len(batch[0][:50])}...")
360
+
361
+ completion = embedding_client.embeddings.create(
362
+ model=model_name,
363
+ input=batch,
364
+ dimensions=Config.dimensions,
365
+ encoding_format="float"
366
+ )
367
+ vectors = [embedding.embedding for embedding in completion.data]
368
+ all_vectors.extend(vectors)
369
+ print(f"批次 {i//batch_size + 1} 向量化成功,获得 {len(vectors)} 个向量")
370
+ except Exception as e:
371
+ print(f"向量化批次 {i//batch_size + 1} 失败:{str(e)}")
372
+ print(f"问题批次中的第一个文本: {batch[0][:100]}...")
373
+ traceback.print_exc()
374
+ # 如果是第一批就失败,直接返回空数组
375
+ if i == 0:
376
+ return np.array([])
377
+ # 否则返回已处理的向量
378
+ break
379
+
380
+ # 检查是否获得了任何向量
381
+ if not all_vectors:
382
+ print("错误: 向量化过程没有产生任何向量")
383
+ return np.array([])
384
+
385
+ return np.array(all_vectors)
386
+
387
+ # 简单的向量搜索,用于基本对比
388
+ def vector_search(query, index_path, metadata_path, limit):
389
+ """基本向量搜索函数"""
390
+ query_vector = vectorize_query(query)
391
+ if query_vector.size == 0:
392
+ return []
393
+
394
+ query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
395
+
396
+ index = faiss.read_index(index_path)
397
+ try:
398
+ with open(metadata_path, 'r', encoding='utf-8') as f:
399
+ metadata = json.load(f)
400
+ except UnicodeDecodeError:
401
+ print(f"警告:{metadata_path} 包含非法字符,使用 UTF-8 忽略错误重新加载")
402
+ with open(metadata_path, 'rb') as f:
403
+ content = f.read().decode('utf-8', errors='ignore')
404
+ metadata = json.loads(content)
405
+
406
+ D, I = index.search(query_vector, limit)
407
+ results = [metadata[i] for i in I[0] if i < len(metadata)]
408
+ return results
409
+
410
+ def clean_text(text):
411
+ """清理文本中的非法字符,控制文本长度"""
412
+ if not text:
413
+ return ""
414
+ # 移除控制字符,保留换行和制表符
415
+ text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
416
+ # 移除重复的空白字符
417
+ text = re.sub(r'\s+', ' ', text)
418
+ # 确保文本长度在合理范围内
419
+ return text.strip()
420
+
421
+ # PDF文本提取
422
+ def extract_text_from_pdf(pdf_path):
423
+ try:
424
+ doc = fitz.open(pdf_path)
425
+ text = ""
426
+ for page in doc:
427
+ page_text = page.get_text()
428
+ # 清理不可打印字符,尝试用 UTF-8 解码,失败时忽略非法字符
429
+ text += page_text.encode('utf-8', errors='ignore').decode('utf-8')
430
+ if not text.strip():
431
+ print(f"警告:PDF文件 {pdf_path} 提取内容为空")
432
+ return text
433
+ except Exception as e:
434
+ print(f"PDF文本提取失败:{str(e)}")
435
+ return ""
436
+
437
+ # 处理单个文件
438
+ def process_single_file(file_path: str) -> str:
439
+ try:
440
+ if file_path.lower().endswith('.pdf'):
441
+ text = extract_text_from_pdf(file_path)
442
+ if not text:
443
+ return f"PDF文件 {file_path} 内容为空或无法提取"
444
+ else:
445
+ with open(file_path, "rb") as f:
446
+ content = f.read()
447
+ result = chardet.detect(content)
448
+ detected_encoding = result['encoding']
449
+ confidence = result['confidence']
450
+
451
+ # 尝试多种编码方式
452
+ if detected_encoding and confidence > 0.7:
453
+ try:
454
+ text = content.decode(detected_encoding)
455
+ print(f"文件 {file_path} 使用检测到的编码 {detected_encoding} 解码成功")
456
+ except UnicodeDecodeError:
457
+ text = content.decode('utf-8', errors='ignore')
458
+ print(f"文件 {file_path} 使用 {detected_encoding} 解码失败,强制使用 UTF-8 忽略非法字符")
459
+ else:
460
+ # 尝试多种常见编码
461
+ encodings = ['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin-1', 'utf-16', 'cp936', 'big5']
462
+ text = None
463
+ for encoding in encodings:
464
+ try:
465
+ text = content.decode(encoding)
466
+ print(f"文件 {file_path} 使用 {encoding} 解码成功")
467
+ break
468
+ except UnicodeDecodeError:
469
+ continue
470
+
471
+ # 如果所有编码都失败,使用忽略错误的方式解码
472
+ if text is None:
473
+ text = content.decode('utf-8', errors='ignore')
474
+ print(f"警告:文件 {file_path} 使用 UTF-8 忽略非法字符")
475
+
476
+ # 确保文本是干净的,移除非法字符
477
+ text = clean_text(text)
478
+ return text
479
+ except Exception as e:
480
+ print(f"处理文件 {file_path} 时出错: {str(e)}")
481
+ traceback.print_exc()
482
+ return f"处理文件 {file_path} 失败:{str(e)}"
483
+
484
+ # 批量处理并索引文件 - 修改为支持指定知识库
485
+ def process_and_index_files(file_objs: List, kb_name: str = DEFAULT_KB) -> str:
486
+ """处理并索引文件到指定的知识库"""
487
+ # 确保知识库目录存在
488
+ kb_dir = os.path.join(KB_BASE_DIR, kb_name)
489
+ os.makedirs(kb_dir, exist_ok=True)
490
+
491
+ # 设置临时处理文件路径
492
+ semantic_chunk_output = os.path.join(OUTPUT_DIR, "semantic_chunk_output.json")
493
+ semantic_chunk_vector = os.path.join(OUTPUT_DIR, "semantic_chunk_vector.json")
494
+
495
+ # 设置知识库索引文件路径
496
+ semantic_chunk_index = os.path.join(kb_dir, "semantic_chunk.index")
497
+ semantic_chunk_metadata = os.path.join(kb_dir, "semantic_chunk_metadata.json")
498
+
499
+ all_chunks = []
500
+ error_messages = []
501
+ try:
502
+ if not file_objs or len(file_objs) == 0:
503
+ return "错误:没有选择任何文件"
504
+
505
+ print(f"开始处理 {len(file_objs)} 个文件,目标知识库: {kb_name}...")
506
+ with ThreadPoolExecutor(max_workers=4) as executor:
507
+ future_to_file = {executor.submit(process_single_file, file_obj.name): file_obj for file_obj in file_objs}
508
+ for future in as_completed(future_to_file):
509
+ result = future.result()
510
+ file_obj = future_to_file[future]
511
+ file_name = file_obj.name
512
+
513
+ if isinstance(result, str) and result.startswith("处理文件"):
514
+ error_messages.append(result)
515
+ print(result)
516
+ continue
517
+
518
+ # 检查结果是否为有效文本
519
+ if not result or not isinstance(result, str) or len(result.strip()) == 0:
520
+ error_messages.append(f"文件 {file_name} 处理后内容为空")
521
+ print(f"警告: 文件 {file_name} 处理后内容为空")
522
+ continue
523
+
524
+ print(f"对文件 {file_name} 进行语义分块...")
525
+ chunks = semantic_chunk(result)
526
+
527
+ if not chunks or len(chunks) == 0:
528
+ error_messages.append(f"文件 {file_name} 无法生成任何分块")
529
+ print(f"警告: 文件 {file_name} 无法生成任何分块")
530
+ continue
531
+
532
+ # 将处理后的文件保存到知识库目录
533
+ file_basename = os.path.basename(file_name)
534
+ dest_file_path = os.path.join(kb_dir, file_basename)
535
+ try:
536
+ shutil.copy2(file_name, dest_file_path)
537
+ print(f"已将文件 {file_basename} 复制到知识库 {kb_name}")
538
+ except Exception as e:
539
+ print(f"复制文件到知识库失败: {str(e)}")
540
+
541
+ all_chunks.extend(chunks)
542
+ print(f"文件 {file_name} 处理完成,生成 {len(chunks)} 个分块")
543
+
544
+ if not all_chunks:
545
+ return "所有文件处理失败或内容为空\n" + "\n".join(error_messages)
546
+
547
+ # 确保分块内容干净且长度合适
548
+ valid_chunks = []
549
+ for chunk in all_chunks:
550
+ # 深度清理文本
551
+ clean_chunk_text = clean_text(chunk["chunk"])
552
+
553
+ # 检查清理后的文本是否有效
554
+ if clean_chunk_text and 1 <= len(clean_chunk_text) <= 8000:
555
+ chunk["chunk"] = clean_chunk_text
556
+ valid_chunks.append(chunk)
557
+ elif len(clean_chunk_text) > 8000:
558
+ # 如果文本太长,截断它
559
+ chunk["chunk"] = clean_chunk_text[:8000]
560
+ valid_chunks.append(chunk)
561
+ print(f"警告: 分块 {chunk['id']} 过长已被截断")
562
+ else:
563
+ print(f"警告: 跳过无效分块 {chunk['id']}")
564
+
565
+ if not valid_chunks:
566
+ return "所有生成的分块内容无效或为空\n" + "\n".join(error_messages)
567
+
568
+ print(f"处理了 {len(all_chunks)} 个分块,有效分块数: {len(valid_chunks)}")
569
+
570
+ # 保存语义分块
571
+ with open(semantic_chunk_output, 'w', encoding='utf-8') as json_file:
572
+ json.dump(valid_chunks, json_file, ensure_ascii=False, indent=4)
573
+ print(f"语义分块完成: {semantic_chunk_output}")
574
+
575
+ # 向量化语义分块
576
+ print(f"开始向量化 {len(valid_chunks)} 个分块...")
577
+ vectorize_file(valid_chunks, semantic_chunk_vector)
578
+ print(f"语义分块向量化完成: {semantic_chunk_vector}")
579
+
580
+ # 验证向量文件是否有效
581
+ try:
582
+ with open(semantic_chunk_vector, 'r', encoding='utf-8') as f:
583
+ vector_data = json.load(f)
584
+
585
+ if not vector_data or len(vector_data) == 0:
586
+ return f"向量化失败: 生成的向量文件为空\n" + "\n".join(error_messages)
587
+
588
+ # 检查向量数据结构
589
+ if 'vector' not in vector_data[0]:
590
+ return f"向量化失败: 数据中缺少向量字段\n" + "\n".join(error_messages)
591
+
592
+ print(f"成功生成 {len(vector_data)} 个向量")
593
+ except Exception as e:
594
+ return f"读取向量文件失败: {str(e)}\n" + "\n".join(error_messages)
595
+
596
+ # 构建索引
597
+ print(f"开始为知识库 {kb_name} 构建索引...")
598
+ build_faiss_index(semantic_chunk_vector, semantic_chunk_index, semantic_chunk_metadata)
599
+ print(f"知识库 {kb_name} 索引构建完成: {semantic_chunk_index}")
600
+
601
+ status = f"知识库 {kb_name} 更新成功!共处理 {len(valid_chunks)} 个有效分块。\n"
602
+ if error_messages:
603
+ status += "以下文件处理过程中出现问题:\n" + "\n".join(error_messages)
604
+ return status
605
+ except Exception as e:
606
+ error = f"知识库 {kb_name} 索引构建过程中出错:{str(e)}"
607
+ print(error)
608
+ traceback.print_exc()
609
+ return error + "\n" + "\n".join(error_messages)
610
+
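A minimal usage sketch for the indexing pipeline above, outside of Gradio. The file paths and the SimpleNamespace wrapper are assumptions for illustration only; Gradio normally supplies file objects exposing a `.name` path attribute, which is all `process_and_index_files` relies on.

from types import SimpleNamespace

# hypothetical local documents; any objects with a .name path attribute will do
files = [SimpleNamespace(name="docs/diabetes_guideline.pdf"),
         SimpleNamespace(name="docs/hypertension_faq.txt")]
status = process_and_index_files(files, kb_name="cardiology")
print(status)  # e.g. "知识库 cardiology 更新成功!共处理 N 个有效分块。"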
611
+ # 核心联网搜索功能
612
+ def get_search_background(query: str, max_length: int = 1500) -> str:
613
+ try:
614
+ from retrievor import q_searching
615
+ search_results = q_searching(query)
616
+ cleaned_results = re.sub(r'\s+', ' ', search_results).strip()
617
+ return cleaned_results[:max_length]
618
+ except Exception as e:
619
+ print(f"联网搜索失败:{str(e)}")
620
+ return ""
621
+
622
+ # 基本的回答生成
623
+ def generate_answer_from_deepseek(question: str, system_prompt: str = "你是一名专业医疗助手,请根据背景知识回答问题。", background_info: Optional[str] = None) -> str:
624
+ deepseek_client = DeepSeekClient()
625
+ user_prompt = f"问题:{question}"
626
+ if background_info:
627
+ user_prompt = f"背景知识:{background_info}\n\n{user_prompt}"
628
+ try:
629
+ answer = deepseek_client.generate_answer(system_prompt, user_prompt)
630
+ return answer
631
+ except Exception as e:
632
+ return f"生成回答时出错:{str(e)}"
633
+
634
+ # 多跳推理RAG系统 - 核心创新点
635
+ class ReasoningRAG:
636
+ """
637
+ 多跳推理RAG系统,通过迭代式的检索和推理过程回答问题,支持流式响应
638
+ """
639
+
640
+ def __init__(self,
641
+ index_path: str,
642
+ metadata_path: str,
643
+ max_hops: int = 3,
644
+ initial_candidates: int = 5,
645
+ refined_candidates: int = 3,
646
+ reasoning_model: str = Config.llm_model,
647
+ verbose: bool = False):
648
+ """
649
+ 初始化推理RAG系统
650
+
651
+ 参数:
652
+ index_path: FAISS索引的路径
653
+ metadata_path: 元数据JSON文件的路径
654
+ max_hops: 最大推理-检索跳数
655
+ initial_candidates: 初始检索候选数量
656
+ refined_candidates: 精炼检索候选数量
657
+ reasoning_model: 用于推理步骤的LLM模型
658
+ verbose: 是否打印详细日志
659
+ """
660
+ self.index_path = index_path
661
+ self.metadata_path = metadata_path
662
+ self.max_hops = max_hops
663
+ self.initial_candidates = initial_candidates
664
+ self.refined_candidates = refined_candidates
665
+ self.reasoning_model = reasoning_model
666
+ self.verbose = verbose
667
+
668
+ # 加载索引和元数据
669
+ self._load_resources()
670
+
671
+ def _load_resources(self):
672
+ """加载FAISS索引和元数据"""
673
+ if os.path.exists(self.index_path) and os.path.exists(self.metadata_path):
674
+ self.index = faiss.read_index(self.index_path)
675
+ try:
676
+ with open(self.metadata_path, 'r', encoding='utf-8') as f:
677
+ self.metadata = json.load(f)
678
+ except UnicodeDecodeError:
679
+ with open(self.metadata_path, 'rb') as f:
680
+ content = f.read().decode('utf-8', errors='ignore')
681
+ self.metadata = json.loads(content)
682
+ else:
683
+ raise FileNotFoundError(f"Index or metadata not found at {self.index_path} or {self.metadata_path}")
684
+
685
+ def _vectorize_query(self, query: str) -> np.ndarray:
686
+ """将查询转换为向量"""
687
+ return vectorize_query(query).reshape(1, -1)
688
+
689
+ def _retrieve(self, query_vector: np.ndarray, limit: int) -> List[Dict[str, Any]]:
690
+ """使用向量相似性检索块"""
691
+ if query_vector.size == 0:
692
+ return []
693
+
694
+ D, I = self.index.search(query_vector, limit)
695
+ results = [self.metadata[i] for i in I[0] if 0 <= i < len(self.metadata)]  # skip the -1 padding faiss returns for missing hits
696
+ return results
697
+
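For reference, `index.search` in `_retrieve` returns a distances array and an indices array, both shaped `(n_queries, limit)`, and positions faiss cannot fill are padded with `-1`, which is why the bounds check above matters. A tiny self-contained illustration follows; the actual index type built by `build_faiss_index` is not shown in this file, so `IndexFlatIP` here is only an assumption.

import faiss
import numpy as np

dim = 4
index = faiss.IndexFlatIP(dim)                         # stand-in for the KB index
index.add(np.random.rand(10, dim).astype("float32"))   # 10 fake chunk vectors
D, I = index.search(np.random.rand(1, dim).astype("float32"), 3)
print(D.shape, I.shape)  # (1, 3) (1, 3); I holds row ids into the metadata list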
698
+ def _generate_reasoning(self,
699
+ query: str,
700
+ retrieved_chunks: List[Dict[str, Any]],
701
+ previous_queries: List[str] = None,
702
+ hop_number: int = 0) -> Dict[str, Any]:
703
+ """
704
+ 为检索到的信息生成推理分析并识别信息缺口
705
+
706
+ 返回包含以下字段的字典:
707
+ - analysis: 对当前信息的推理分析
708
+ - missing_info: 已识别的缺失信息
709
+ - follow_up_queries: 填补信息缺口的后续查询列表
710
+ - is_sufficient: 表示信息是否足够的布尔值
711
+ """
712
+ if previous_queries is None:
713
+ previous_queries = []
714
+
715
+ # 为模型准备上下文
716
+ chunks_text = "\n\n".join([f"[Chunk {i+1}]: {chunk['chunk']}"
717
+ for i, chunk in enumerate(retrieved_chunks)])
718
+
719
+ previous_queries_text = "\n".join([f"Q{i+1}: {q}" for i, q in enumerate(previous_queries)])
720
+
721
+ system_prompt = """
722
+ 你是医疗信息检索的专家分析系统。
723
+ 你的任务是分析检索到的信息块,识别缺失的内容,并提出有针对性的后续查询来填补信息缺口。
724
+
725
+ 重点关注医疗领域知识,如:
726
+ - 疾病诊断和症状
727
+ - 治疗方法和药物
728
+ - 医学研究和临床试验
729
+ - 患者护理和康复
730
+ - 医疗法规和伦理
731
+ """
732
+
733
+ user_prompt = f"""
734
+ ## 原始查询
735
+ {query}
736
+
737
+ ## 先前查询(如果有)
738
+ {previous_queries_text if previous_queries else "无"}
739
+
740
+ ## 检索到的信息(跳数 {hop_number})
741
+ {chunks_text if chunks_text else "未检索到信息。"}
742
+
743
+ ## 你的任务
744
+ 1. 分析已检索到的信息与原始查询的关系
745
+ 2. 确定能够更完整回答查询的特定缺失信息
746
+ 3. 提出1-3个针对性的后续查询,以检索缺失信息
747
+ 4. 确定当前信息是否足够回答原始查询
748
+
749
+ 以JSON格式回答,包含以下字段:
750
+ - analysis: 对当前信息的详细分析
751
+ - missing_info: 特定缺失信息的列表
752
+ - follow_up_queries: 1-3个具体的后续查询
753
+ - is_sufficient: 表示信息是否足够的布尔值
754
+ """
755
+
756
+ try:
757
+ response = client.chat.completions.create(
758
+ model=Config.llm_model,
759
+ messages=[
760
+ {"role": "system", "content": system_prompt},
761
+ {"role": "user", "content": user_prompt}
762
+ ],
763
+ response_format={"type": "json_object"}
764
+ )
765
+ reasoning_text = response.choices[0].message.content.strip()
766
+
767
+ # 解析JSON响应
768
+ try:
769
+ reasoning = json.loads(reasoning_text)
770
+ # 确保预期的键存在
771
+ required_keys = ["analysis", "missing_info", "follow_up_queries", "is_sufficient"]
772
+ for key in required_keys:
773
+ if key not in reasoning:
774
+ reasoning[key] = "" if key == "analysis" else (False if key == "is_sufficient" else [])
775
+ return reasoning
776
+ except json.JSONDecodeError:
777
+ # 如果JSON解析失败,则回退
778
+ if self.verbose:
779
+ print(f"无法从模型输出解析JSON: {reasoning_text[:100]}...")
780
+ return {
781
+ "analysis": "无法分析检索到的信息。",
782
+ "missing_info": ["无法识别缺失信息"],
783
+ "follow_up_queries": [],
784
+ "is_sufficient": False
785
+ }
786
+
787
+ except Exception as e:
788
+ if self.verbose:
789
+ print(f"推理生成错误: {e}")
790
+ print(traceback.format_exc())
791
+ return {
792
+ "analysis": "分析过程出错。",
793
+ "missing_info": [],
794
+ "follow_up_queries": [],
795
+ "is_sufficient": False
796
+ }
797
+
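For clarity, this is the shape of the JSON object `_generate_reasoning` asks the LLM to return; the field values below are invented purely to illustrate the schema.

example_reasoning = {
    "analysis": "检索结果覆盖了2型糖尿病的典型症状,但缺少一线药物的剂量信息。",
    "missing_info": ["二甲双胍的起始剂量", "用药禁忌症"],
    "follow_up_queries": ["二甲双胍 起始剂量 指南", "二甲双胍 禁忌症"],
    "is_sufficient": False
}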
798
+ def _synthesize_answer(self,
799
+ query: str,
800
+ all_chunks: List[Dict[str, Any]],
801
+ reasoning_steps: List[Dict[str, Any]],
802
+ use_table_format: bool = False) -> str:
803
+ """从所有检索到的块和推理步骤中合成最终答案"""
804
+ # 合并所有块,去除重复
805
+ unique_chunks = []
806
+ chunk_ids = set()
807
+ for chunk in all_chunks:
808
+ if chunk["id"] not in chunk_ids:
809
+ unique_chunks.append(chunk)
810
+ chunk_ids.add(chunk["id"])
811
+
812
+ # 准备上下文
813
+ chunks_text = "\n\n".join([f"[Chunk {i+1}]: {chunk['chunk']}"
814
+ for i, chunk in enumerate(unique_chunks)])
815
+
816
+ # 准备推理跟踪
817
+ reasoning_trace = ""
818
+ for i, step in enumerate(reasoning_steps):
819
+ reasoning_trace += f"\n\n推理步骤 {i+1}:\n"
820
+ reasoning_trace += f"分析: {step['analysis']}\n"
821
+ reasoning_trace += f"缺失信息: {', '.join(step['missing_info'])}\n"
822
+ reasoning_trace += f"后续查询: {', '.join(step['follow_up_queries'])}"
823
+
824
+ system_prompt = """
825
+ 你是医疗领域的专家。基于检索到的信息块,为用户的查询合成一个全面的答案。
826
+
827
+ 重点提供有关医疗和健康的准确、基于证据的信息,包括诊断、治疗、预防和医学研究等方面。
828
+
829
+ 逻辑地组织你的答案,并在适当时引用块中的具体信息。如果信息不完整,请承认限制。
830
+ """
831
+
832
+ output_format_instruction = ""
833
+ if use_table_format:
834
+ output_format_instruction = """
835
+ 请尽可能以Markdown表格格式组织你的回答。如果信息适合表格形式展示,请使用表格;
836
+ 如果不适合表格形式,可以先用文本介绍,然后再使用表格总结关键信息。
837
+
838
+ 表格语法示例:
839
+ | 标题1 | 标题2 | 标题3 |
840
+ | ----- | ----- | ----- |
841
+ | 内容1 | 内容2 | 内容3 |
842
+
843
+ 确保表格格式符合Markdown标准,以便正确渲染。
844
+ """
845
+
846
+ user_prompt = f"""
847
+ ## 原始查询
848
+ {query}
849
+
850
+ ## 检索到的信息块
851
+ {chunks_text}
852
+
853
+ ## 推理过程
854
+ {reasoning_trace}
855
+
856
+ ## 你的任务
857
+ 使用提供的信息块为原始查询合成一个全面的答案。你的答案应该:
858
+
859
+ 1. 直接回应查询
860
+ 2. 结构清晰,易于理解
861
+ 3. 基于检索到的信息
862
+ 4. 承认可用信息中的任何重大缺口
863
+
864
+ {output_format_instruction}
865
+
866
+ 以直接回应提出原始查询的用户的方式呈现你的答案。
867
+ """
868
+
869
+ try:
870
+ response = client.chat.completions.create(
871
+ model=Config.llm_model,
872
+ messages=[
873
+ {"role": "system", "content": system_prompt},
874
+ {"role": "user", "content": user_prompt}
875
+ ]
876
+ )
877
+ return response.choices[0].message.content.strip()
878
+ except Exception as e:
879
+ if self.verbose:
880
+ print(f"答案合成错误: {e}")
881
+ print(traceback.format_exc())
882
+ return "由于出错,无法生成答案。"
883
+
884
+ def stream_retrieve_and_answer(self, query: str, use_table_format: bool = False):
885
+ """
886
+ 执行多跳检索和回答生成的流式方法,逐步返回结果
887
+
888
+ 这是一个生成器函数,会在处理的每个阶段产生中间结果
889
+ """
890
+ all_chunks = []
891
+ all_queries = [query]
892
+ reasoning_steps = []
893
+
894
+ # 生成状态更新
895
+ yield {
896
+ "status": "正在将查询向量化...",
897
+ "reasoning_display": "",
898
+ "answer": None,
899
+ "all_chunks": [],
900
+ "reasoning_steps": []
901
+ }
902
+
903
+ # 初始检索
904
+ try:
905
+ query_vector = self._vectorize_query(query)
906
+ if query_vector.size == 0:
907
+ yield {
908
+ "status": "向量化失败",
909
+ "reasoning_display": "由于嵌入错误,无法处理查询。",
910
+ "answer": "由于嵌入错误,无法处理查询。",
911
+ "all_chunks": [],
912
+ "reasoning_steps": []
913
+ }
914
+ return
915
+
916
+ yield {
917
+ "status": "正在执行初始检索...",
918
+ "reasoning_display": "",
919
+ "answer": None,
920
+ "all_chunks": [],
921
+ "reasoning_steps": []
922
+ }
923
+
924
+ initial_chunks = self._retrieve(query_vector, self.initial_candidates)
925
+ all_chunks.extend(initial_chunks)
926
+
927
+ if not initial_chunks:
928
+ yield {
929
+ "status": "未找到相关信息",
930
+ "reasoning_display": "未找到与您的查询相关的信息。",
931
+ "answer": "未找到与您的查询相关的信息。",
932
+ "all_chunks": [],
933
+ "reasoning_steps": []
934
+ }
935
+ return
936
+
937
+ # 更新状态,展示找到的初始块
938
+ chunks_preview = "\n".join([f"- {chunk['chunk'][:100]}..." for chunk in initial_chunks[:2]])
939
+ yield {
940
+ "status": f"找到 {len(initial_chunks)} 个相关信息块,正在生成初步分析...",
941
+ "reasoning_display": f"### 检索到的初始信息\n{chunks_preview}\n\n### 正在分析...",
942
+ "answer": None,
943
+ "all_chunks": all_chunks,
944
+ "reasoning_steps": []
945
+ }
946
+
947
+ # 初始推理
948
+ reasoning = self._generate_reasoning(query, initial_chunks, hop_number=0)
949
+ reasoning_steps.append(reasoning)
950
+
951
+ # 生成当前的推理显示
952
+ reasoning_display = "### 多跳推理过程\n"
953
+ reasoning_display += f"**推理步骤 1**\n"
954
+ reasoning_display += f"- 分析: {reasoning['analysis'][:200]}...\n"
955
+ reasoning_display += f"- 缺失信息: {', '.join(reasoning['missing_info'])}\n"
956
+ if reasoning['follow_up_queries']:
957
+ reasoning_display += f"- 后续查询: {', '.join(reasoning['follow_up_queries'])}\n"
958
+ reasoning_display += f"- 信息是否足够: {'是' if reasoning['is_sufficient'] else '否'}\n\n"
959
+
960
+ yield {
961
+ "status": "初步分析完成",
962
+ "reasoning_display": reasoning_display,
963
+ "answer": None,
964
+ "all_chunks": all_chunks,
965
+ "reasoning_steps": reasoning_steps
966
+ }
967
+
968
+ # 检查是否需要额外的跳数
969
+ hop = 1
970
+ while (hop < self.max_hops and
971
+ not reasoning["is_sufficient"] and
972
+ reasoning["follow_up_queries"]):
973
+
974
+ follow_up_status = f"执行跳数 {hop},正在处理 {len(reasoning['follow_up_queries'])} 个后续查询..."
975
+ yield {
976
+ "status": follow_up_status,
977
+ "reasoning_display": reasoning_display + f"\n\n### {follow_up_status}",
978
+ "answer": None,
979
+ "all_chunks": all_chunks,
980
+ "reasoning_steps": reasoning_steps
981
+ }
982
+
983
+ hop_chunks = []
984
+
985
+ # 处理每个后续查询
986
+ for i, follow_up_query in enumerate(reasoning["follow_up_queries"]):
987
+ all_queries.append(follow_up_query)
988
+
989
+ query_status = f"处理后续查询 {i+1}/{len(reasoning['follow_up_queries'])}: {follow_up_query}"
990
+ yield {
991
+ "status": query_status,
992
+ "reasoning_display": reasoning_display + f"\n\n### {query_status}",
993
+ "answer": None,
994
+ "all_chunks": all_chunks,
995
+ "reasoning_steps": reasoning_steps
996
+ }
997
+
998
+ # 为后续查询检索
999
+ follow_up_vector = self._vectorize_query(follow_up_query)
1000
+ if follow_up_vector.size > 0:
1001
+ follow_up_chunks = self._retrieve(follow_up_vector, self.refined_candidates)
1002
+ hop_chunks.extend(follow_up_chunks)
1003
+ all_chunks.extend(follow_up_chunks)
1004
+
1005
+ # 更新状态,显示新找到的块数量
1006
+ yield {
1007
+ "status": f"查询 '{follow_up_query}' 找到了 {len(follow_up_chunks)} 个相关块",
1008
+ "reasoning_display": reasoning_display + f"\n\n为查询 '{follow_up_query}' 找到了 {len(follow_up_chunks)} 个相关块",
1009
+ "answer": None,
1010
+ "all_chunks": all_chunks,
1011
+ "reasoning_steps": reasoning_steps
1012
+ }
1013
+
1014
+ # 为此跳数生成推理
1015
+ yield {
1016
+ "status": f"正在为跳数 {hop} 生成推理分析...",
1017
+ "reasoning_display": reasoning_display + f"\n\n### 正在为跳数 {hop} 生成推理分析...",
1018
+ "answer": None,
1019
+ "all_chunks": all_chunks,
1020
+ "reasoning_steps": reasoning_steps
1021
+ }
1022
+
1023
+ reasoning = self._generate_reasoning(
1024
+ query,
1025
+ hop_chunks,
1026
+ previous_queries=all_queries[:-1],
1027
+ hop_number=hop
1028
+ )
1029
+ reasoning_steps.append(reasoning)
1030
+
1031
+ # 更新推理显示
1032
+ reasoning_display += f"\n**推理步骤 {hop+1}**\n"
1033
+ reasoning_display += f"- 分析: {reasoning['analysis'][:200]}...\n"
1034
+ reasoning_display += f"- 缺失信息: {', '.join(reasoning['missing_info'])}\n"
1035
+ if reasoning['follow_up_queries']:
1036
+ reasoning_display += f"- 后续查询: {', '.join(reasoning['follow_up_queries'])}\n"
1037
+ reasoning_display += f"- 信息是否足够: {'是' if reasoning['is_sufficient'] else '否'}\n"
1038
+
1039
+ yield {
1040
+ "status": f"跳数 {hop} 完成",
1041
+ "reasoning_display": reasoning_display,
1042
+ "answer": None,
1043
+ "all_chunks": all_chunks,
1044
+ "reasoning_steps": reasoning_steps
1045
+ }
1046
+
1047
+ hop += 1
1048
+
1049
+ # 合成最终答案
1050
+ yield {
1051
+ "status": "正在合成最终答案...",
1052
+ "reasoning_display": reasoning_display + "\n\n### 正在合成最终答案...",
1053
+ "answer": "正在处理您的问题,请稍候...",
1054
+ "all_chunks": all_chunks,
1055
+ "reasoning_steps": reasoning_steps
1056
+ }
1057
+
1058
+ answer = self._synthesize_answer(query, all_chunks, reasoning_steps, use_table_format)
1059
+
1060
+ # 为最终显示准备检索内容汇总
1061
+ all_chunks_summary = "\n\n".join([f"**检索块 {i+1}**:\n{chunk['chunk']}"
1062
+ for i, chunk in enumerate(all_chunks[:10])]) # 限制显示前10个块
1063
+
1064
+ if len(all_chunks) > 10:
1065
+ all_chunks_summary += f"\n\n...以及另外 {len(all_chunks) - 10} 个块(总计 {len(all_chunks)} 个)"
1066
+
1067
+ enhanced_display = reasoning_display + "\n\n### 检索到的内容\n" + all_chunks_summary + "\n\n### 回答已生成"
1068
+
1069
+ yield {
1070
+ "status": "回答已生成",
1071
+ "reasoning_display": enhanced_display,
1072
+ "answer": answer,
1073
+ "all_chunks": all_chunks,
1074
+ "reasoning_steps": reasoning_steps
1075
+ }
1076
+
1077
+ except Exception as e:
1078
+ error_msg = f"处理过程中出错: {str(e)}"
1079
+ if self.verbose:
1080
+ print(error_msg)
1081
+ print(traceback.format_exc())
1082
+
1083
+ yield {
1084
+ "status": "处理出错",
1085
+ "reasoning_display": error_msg,
1086
+ "answer": f"处理您的问题时遇到错误: {str(e)}",
1087
+ "all_chunks": all_chunks,
1088
+ "reasoning_steps": reasoning_steps
1089
+ }
1090
+
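A minimal sketch of how the streaming generator above would be consumed (the same pattern appears later in `process_question_with_reasoning`); the knowledge-base paths are placeholders and an index must already exist at them.

rag = ReasoningRAG(index_path="knowledge_bases/default/semantic_chunk.index",
                   metadata_path="knowledge_bases/default/semantic_chunk_metadata.json")
final_answer = None
for step in rag.stream_retrieve_and_answer("2型糖尿病的一线治疗是什么?"):
    print(step["status"])              # progress message for the UI
    if step["answer"]:
        final_answer = step["answer"]  # the last non-None answer is the synthesized reply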
1091
+ def retrieve_and_answer(self, query: str, use_table_format: bool = False) -> Tuple[str, Dict[str, Any]]:
1092
+ """
1093
+ 执行多跳检索和回答生成的主要方法
1094
+
1095
+ 返回:
1096
+ 包含以下内容的元组:
1097
+ - 最终答案
1098
+ - 包含推理步骤和所有检索到的块的调试字典
1099
+ """
1100
+ all_chunks = []
1101
+ all_queries = [query]
1102
+ reasoning_steps = []
1103
+ debug_info = {"reasoning_steps": [], "all_chunks": [], "all_queries": all_queries}
1104
+
1105
+ # 初始检索
1106
+ query_vector = self._vectorize_query(query)
1107
+ if query_vector.size == 0:
1108
+ return "由于嵌入错误,无法处理查询。", debug_info
1109
+
1110
+ initial_chunks = self._retrieve(query_vector, self.initial_candidates)
1111
+ all_chunks.extend(initial_chunks)
1112
+ debug_info["all_chunks"].extend(initial_chunks)
1113
+
1114
+ if not initial_chunks:
1115
+ return "未找到与您的查询相关的信息。", debug_info
1116
+
1117
+ # 初始推理
1118
+ reasoning = self._generate_reasoning(query, initial_chunks, hop_number=0)
1119
+ reasoning_steps.append(reasoning)
1120
+ debug_info["reasoning_steps"].append(reasoning)
1121
+
1122
+ # 检查是否需要额外的跳数
1123
+ hop = 1
1124
+ while (hop < self.max_hops and
1125
+ not reasoning["is_sufficient"] and
1126
+ reasoning["follow_up_queries"]):
1127
+
1128
+ if self.verbose:
1129
+ print(f"开始跳数 {hop},有 {len(reasoning['follow_up_queries'])} 个后续查询")
1130
+
1131
+ hop_chunks = []
1132
+
1133
+ # 处理每个后续查询
1134
+ for follow_up_query in reasoning["follow_up_queries"]:
1135
+ all_queries.append(follow_up_query)
1136
+ debug_info["all_queries"].append(follow_up_query)
1137
+
1138
+ # 为后续查询检索
1139
+ follow_up_vector = self._vectorize_query(follow_up_query)
1140
+ if follow_up_vector.size > 0:
1141
+ follow_up_chunks = self._retrieve(follow_up_vector, self.refined_candidates)
1142
+ hop_chunks.extend(follow_up_chunks)
1143
+ all_chunks.extend(follow_up_chunks)
1144
+ debug_info["all_chunks"].extend(follow_up_chunks)
1145
+
1146
+ # 为此跳数生成推理
1147
+ reasoning = self._generate_reasoning(
1148
+ query,
1149
+ hop_chunks,
1150
+ previous_queries=all_queries[:-1],
1151
+ hop_number=hop
1152
+ )
1153
+ reasoning_steps.append(reasoning)
1154
+ debug_info["reasoning_steps"].append(reasoning)
1155
+
1156
+ hop += 1
1157
+
1158
+ # 合成最终答案
1159
+ answer = self._synthesize_answer(query, all_chunks, reasoning_steps, use_table_format)
1160
+
1161
+ return answer, debug_info
1162
+
1163
+ # 基于选定知识库生成索引路径
1164
+ def get_kb_paths(kb_name: str) -> Dict[str, str]:
1165
+ """获取指定知识库的索引文件路径"""
1166
+ kb_dir = os.path.join(KB_BASE_DIR, kb_name)
1167
+ return {
1168
+ "index_path": os.path.join(kb_dir, "semantic_chunk.index"),
1169
+ "metadata_path": os.path.join(kb_dir, "semantic_chunk_metadata.json")
1170
+ }
1171
+
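For a hypothetical knowledge base named `cardiology`, and assuming `KB_BASE_DIR` keeps its default value of `knowledge_bases`, the helper above resolves to:

get_kb_paths("cardiology")
# -> {"index_path": "knowledge_bases/cardiology/semantic_chunk.index",
#     "metadata_path": "knowledge_bases/cardiology/semantic_chunk_metadata.json"}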
1172
+ def multi_hop_generate_answer(query: str, kb_name: str, use_table_format: bool = False, system_prompt: str = "你是一名医疗专家。") -> Tuple[str, Dict]:
1173
+ """使用多跳推理RAG生成答案,基于指定知识库"""
1174
+ kb_paths = get_kb_paths(kb_name)
1175
+
1176
+ reasoning_rag = ReasoningRAG(
1177
+ index_path=kb_paths["index_path"],
1178
+ metadata_path=kb_paths["metadata_path"],
1179
+ max_hops=3,
1180
+ initial_candidates=5,
1181
+ refined_candidates=3,
1182
+ reasoning_model=Config.llm_model,
1183
+ verbose=True
1184
+ )
1185
+
1186
+ answer, debug_info = reasoning_rag.retrieve_and_answer(query, use_table_format)
1187
+ return answer, debug_info
1188
+
1189
+ # 使用简单向量检索生成答案,基于指定知识库
1190
+ def simple_generate_answer(query: str, kb_name: str, use_table_format: bool = False) -> str:
1191
+ """使用简单的向量检索生成答案,不使用多跳推理"""
1192
+ try:
1193
+ kb_paths = get_kb_paths(kb_name)
1194
+
1195
+ # 使用基本向量搜索
1196
+ search_results = vector_search(query, kb_paths["index_path"], kb_paths["metadata_path"], limit=5)
1197
+
1198
+ if not search_results:
1199
+ return "未找到相关信息。"
1200
+
1201
+ # 准备背景信息
1202
+ background_chunks = "\n\n".join([f"[相关信息 {i+1}]: {result['chunk']}"
1203
+ for i, result in enumerate(search_results)])
1204
+
1205
+ # 生成答案
1206
+ system_prompt = "你是一名医疗专家。基于提供的背景信息回答用户的问题。"
1207
+
1208
+ if use_table_format:
1209
+ system_prompt += "请尽可能以Markdown表格的形式呈现结构化信息。"
1210
+
1211
+ user_prompt = f"""
1212
+ 问题:{query}
1213
+
1214
+ 背景信息:
1215
+ {background_chunks}
1216
+
1217
+ 请基于以上背景信息回答用户的问题。
1218
+ """
1219
+
1220
+ response = client.chat.completions.create(
1221
+ model=Config.llm_model,
1222
+ messages=[
1223
+ {"role": "system", "content": system_prompt},
1224
+ {"role": "user", "content": user_prompt}
1225
+ ]
1226
+ )
1227
+
1228
+ return response.choices[0].message.content.strip()
1229
+
1230
+ except Exception as e:
1231
+ return f"生成答案时出错:{str(e)}"
1232
+
1233
+ # 修改主要的问题处理函数以支持指定知识库
1234
+ def ask_question_parallel(question: str, kb_name: str = DEFAULT_KB, use_search: bool = True, use_table_format: bool = False, multi_hop: bool = False) -> str:
1235
+ """基于指定知识库回答问题"""
1236
+ try:
1237
+ kb_paths = get_kb_paths(kb_name)
1238
+ index_path = kb_paths["index_path"]
1239
+ metadata_path = kb_paths["metadata_path"]
1240
+
1241
+ search_background = ""
1242
+ local_answer = ""
1243
+ debug_info = {}
1244
+
1245
+ # 并行处理
1246
+ with ThreadPoolExecutor(max_workers=2) as executor:
1247
+ futures = {}
1248
+
1249
+ if use_search:
1250
+ futures[executor.submit(get_search_background, question)] = "search"
1251
+
1252
+ if os.path.exists(index_path):
1253
+ if multi_hop:
1254
+ # 使用多跳推理
1255
+ futures[executor.submit(multi_hop_generate_answer, question, kb_name, use_table_format)] = "rag"
1256
+ else:
1257
+ # 使用简单向量检索
1258
+ futures[executor.submit(simple_generate_answer, question, kb_name, use_table_format)] = "simple"
1259
+
1260
+ for future in as_completed(futures):
1261
+ result = future.result()
1262
+ if futures[future] == "search":
1263
+ search_background = result or ""
1264
+ elif futures[future] == "rag":
1265
+ local_answer, debug_info = result
1266
+ elif futures[future] == "simple":
1267
+ local_answer = result
1268
+
1269
+ # 如果同时有搜索和本地结果,合并它们
1270
+ if search_background and local_answer:
1271
+ system_prompt = "你是一名医疗专家,请整合网络搜索和本地知识库提供全面的解答。"
1272
+
1273
+ table_instruction = ""
1274
+ if use_table_format:
1275
+ table_instruction = """
1276
+ 请尽可能以Markdown表格的形式呈现你的回答,特别是对于症状、治疗方法、药物等结构化信息。
1277
+
1278
+ 请确保你的表格遵循正确的Markdown语法:
1279
+ | 列标题1 | 列标题2 | 列标题3 |
1280
+ | ------- | ------- | ------- |
1281
+ | 数据1 | 数据2 | 数据3 |
1282
+ """
1283
+
1284
+ user_prompt = f"""
1285
+ 问题:{question}
1286
+
1287
+ 网络搜索结果:{search_background}
1288
+
1289
+ 本地知识库分析:{local_answer}
1290
+
1291
+ {table_instruction}
1292
+
1293
+ 请根据以上信息,提供一个综合的回答。
1294
+ """
1295
+
1296
+ try:
1297
+ response = client.chat.completions.create(
1298
+ model="qwen-plus",
1299
+ messages=[
1300
+ {"role": "system", "content": system_prompt},
1301
+ {"role": "user", "content": user_prompt}
1302
+ ]
1303
+ )
1304
+ combined_answer = response.choices[0].message.content.strip()
1305
+ return combined_answer
1306
+ except Exception as e:
1307
+ # 如果合并失败,回退到本地答案
1308
+ return local_answer
1309
+ elif local_answer:
1310
+ return local_answer
1311
+ elif search_background:
1312
+ # 仅从搜索结果生成答案
1313
+ system_prompt = "你是一名医疗专家。"
1314
+ if use_table_format:
1315
+ system_prompt += "请尽可能以Markdown表格的形式呈现结构化信息。"
1316
+ return generate_answer_from_deepseek(question, system_prompt=system_prompt, background_info=f"[联网搜索结果]:{search_background}")
1317
+ else:
1318
+ return "未找到相关信息。"
1319
+
1320
+ except Exception as e:
1321
+ return f"查询失败:{str(e)}"
1322
+
1323
+ # 修改以支持多知识库的流式响应函数
1324
+ def process_question_with_reasoning(question: str, kb_name: str = DEFAULT_KB, use_search: bool = True, use_table_format: bool = False, multi_hop: bool = False, chat_history: List = None):
1325
+ """增强版process_question,支持流式响应,实时显示检索和推理过程,支持多知识库和对话历史"""
1326
+ try:
1327
+ kb_paths = get_kb_paths(kb_name)
1328
+ index_path = kb_paths["index_path"]
1329
+ metadata_path = kb_paths["metadata_path"]
1330
+
1331
+ # 构建带对话历史的问题
1332
+ if chat_history and len(chat_history) > 0:
1333
+ # 构建对话上下文
1334
+ context = "之前的对话内容:\n"
1335
+ for user_msg, assistant_msg in chat_history[-3:]: # 只取最近3轮对话
1336
+ context += f"用户:{user_msg}\n"
1337
+ context += f"助手:{assistant_msg}\n"
1338
+ context += f"\n当前问题:{question}"
1339
+ enhanced_question = f"基于以下对话历史,回答用户的当前问题。\n{context}"
1340
+ else:
1341
+ enhanced_question = question
1342
+
1343
+ # 初始状态
1344
+ search_result = "联网搜索进行中..." if use_search else "未启用联网搜索"
1345
+
1346
+ if multi_hop:
1347
+ reasoning_status = f"正在准备对知识库 '{kb_name}' 进行多跳推理检索..."
1348
+ search_display = f"### 联网搜索结果\n{search_result}\n\n### 推理状态\n{reasoning_status}"
1349
+ yield search_display, "正在启动多跳推理流程..."
1350
+ else:
1351
+ reasoning_status = f"正在准备对知识库 '{kb_name}' 进行向量检索..."
1352
+ search_display = f"### 联网搜索结果\n{search_result}\n\n### 检索状态\n{reasoning_status}"
1353
+ yield search_display, "正在启动简单检索流程..."
1354
+
1355
+ # 如果启用,并行运行搜索
1356
+ search_future = None
1357
+ with ThreadPoolExecutor(max_workers=1) as executor:
1358
+ if use_search:
1359
+ search_future = executor.submit(get_search_background, question)
1360
+
1361
+ # 检查索引是否存在
1362
+ if not (os.path.exists(index_path) and os.path.exists(metadata_path)):
1363
+ # 如果索引不存在,提前返回
1364
+ if search_future:
1365
+ # 等待搜索结果
1366
+ search_result = "等待联网搜索结果..."
1367
+ search_display = f"### 联网搜索结果\n{search_result}\n\n### 检索状态\n知识库 '{kb_name}' 中未找到索引"
1368
+ yield search_display, "等待联网搜索结果..."
1369
+
1370
+ search_result = search_future.result() or "未找到相关网络信息"
1371
+ system_prompt = "你是一名医疗专家。请考虑对话历史并回答用户的问题。"
1372
+ if use_table_format:
1373
+ system_prompt += "请尽可能以Markdown表格的形式呈现结构化信息。"
1374
+ answer = generate_answer_from_deepseek(enhanced_question, system_prompt=system_prompt, background_info=f"[联网搜索结果]:{search_result}")
1375
+
1376
+ search_display = f"### 联网搜索结果\n{search_result}\n\n### 检索状态\n无法在知识库 '{kb_name}' 中进行本地检索(未找到索引)"
1377
+ yield search_display, answer
1378
+ else:
1379
+ yield f"知识库 '{kb_name}' 中未找到索引,且未启用联网搜索", "无法回答您的问题。请先上传文件到该知识库或启用联网搜索。"
1380
+ return
1381
+
1382
+ # 开始流式处理
1383
+ current_answer = "正在分析您的问题..."
1384
+
1385
+ if multi_hop:
1386
+ # 使用多跳推理的流式接口
1387
+ reasoning_rag = ReasoningRAG(
1388
+ index_path=index_path,
1389
+ metadata_path=metadata_path,
1390
+ max_hops=3,
1391
+ initial_candidates=5,
1392
+ refined_candidates=3,
1393
+ verbose=True
1394
+ )
1395
+
1396
+ # 使用enhanced_question进行检索
1397
+ for step_result in reasoning_rag.stream_retrieve_and_answer(enhanced_question, use_table_format):
1398
+ # 更新当前状态
1399
+ status = step_result["status"]
1400
+ reasoning_display = step_result["reasoning_display"]
1401
+
1402
+ # 如果有新的答案,更新
1403
+ if step_result["answer"]:
1404
+ current_answer = step_result["answer"]
1405
+
1406
+ # 如果搜索结果已返回,更新搜索结果
1407
+ if search_future and search_future.done():
1408
+ search_result = search_future.result() or "未找到相关网络信息"
1409
+
1410
+ # 构建并返回当前状态
1411
+ current_display = f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 推理状态\n{status}\n\n{reasoning_display}"
1412
+ yield current_display, current_answer
1413
+ else:
1414
+ # 简单向量检索的流式处理
1415
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n正在执行向量相似度搜索...", "正在检索相关信息..."
1416
+
1417
+ # 执行简单向量搜索,使用enhanced_question
1418
+ try:
1419
+ search_results = vector_search(enhanced_question, index_path, metadata_path, limit=5)
1420
+
1421
+ if not search_results:
1422
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n未找到相关信息", f"知识库 '{kb_name}' 中未找到相关信息。"
1423
+ current_answer = f"知识库 '{kb_name}' 中未找到相关信息。"
1424
+ else:
1425
+ # 显示检索到的信息
1426
+ chunks_detail = "\n\n".join([f"**相关信息 {i+1}**:\n{result['chunk']}" for i, result in enumerate(search_results[:5])])
1427
+ chunks_preview = "\n".join([f"- {result['chunk'][:100]}..." for i, result in enumerate(search_results[:3])])
1428
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n找到 {len(search_results)} 个相关信息块\n\n### 检索到的信息预览\n{chunks_preview}", "正在生成答案..."
1429
+
1430
+ # 生成答案
1431
+ background_chunks = "\n\n".join([f"[相关信息 {i+1}]: {result['chunk']}"
1432
+ for i, result in enumerate(search_results)])
1433
+
1434
+ system_prompt = "你是一名医疗专家。基于提供的背景信息和对话历史回答用户的问题。"
1435
+ if use_table_format:
1436
+ system_prompt += "请尽可能以Markdown表格的形式呈现结构化信息。"
1437
+
1438
+ user_prompt = f"""
1439
+ {enhanced_question}
1440
+
1441
+ 背景信息:
1442
+ {background_chunks}
1443
+
1444
+ 请基于以上背景信息和对话历史回答用户的问题。
1445
+ """
1446
+
1447
+ response = client.chat.completions.create(
1448
+ model=Config.llm_model,
1449
+ messages=[
1450
+ {"role": "system", "content": system_prompt},
1451
+ {"role": "user", "content": user_prompt}
1452
+ ]
1453
+ )
1454
+
1455
+ current_answer = response.choices[0].message.content.strip()
1456
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n检索完成,已生成答案\n\n### 检索到的内容\n{chunks_detail}", current_answer
1457
+
1458
+ except Exception as e:
1459
+ error_msg = f"检索过程中出错: {str(e)}"
1460
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n{error_msg}", f"检索过程中出错: {str(e)}"
1461
+ current_answer = f"检索过程中出错: {str(e)}"
1462
+
1463
+ # 检索完成后,如果有搜索结果,可以考虑合并知识
1464
+ if search_future and search_future.done():
1465
+ search_result = search_future.result() or "未找到相关网络信息"
1466
+
1467
+ # 如果同时有搜索结果和本地检索结果,可以考虑合并
1468
+ if search_result and current_answer and current_answer not in ["正在分析您的问题...", "本地知识库中未找到相关信息。"]:
1469
+ status_text = "正在合并联网搜索和知识库结果..."
1470
+ if multi_hop:
1471
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 推理状态\n{status_text}", current_answer
1472
+ else:
1473
+ yield f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 检索状态\n{status_text}", current_answer
1474
+
1475
+ # 合并结果
1476
+ system_prompt = "你是一名医疗专家,请整合网络搜索和本地知识库提供全面的解答。请考虑对话历史。"
1477
+
1478
+ if use_table_format:
1479
+ system_prompt += "请尽可能以Markdown表格的形式呈现结构化信息。"
1480
+
1481
+ user_prompt = f"""
1482
+ {enhanced_question}
1483
+
1484
+ 网络搜索结果:{search_result}
1485
+
1486
+ 本地知识库分析:{current_answer}
1487
+
1488
+ 请根据以上信息和对话历史,提供一个综合的回答。确保使用Markdown表格来呈现适合表格形式的信息。
1489
+ """
1490
+
1491
+ try:
1492
+ response = client.chat.completions.create(
1493
+ model="qwen-plus",
1494
+ messages=[
1495
+ {"role": "system", "content": system_prompt},
1496
+ {"role": "user", "content": user_prompt}
1497
+ ]
1498
+ )
1499
+ combined_answer = response.choices[0].message.content.strip()
1500
+
1501
+ final_status = "已整合联网和知识库结果"
1502
+ if multi_hop:
1503
+ final_display = f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 本地知识库分析\n已完成多跳推理分析,检索到的内容已在上方显示\n\n### 综合分析\n{final_status}"
1504
+ else:
1505
+ # 获取之前检索到的内容
1506
+ chunks_info = "".join([part.split("### 检索到的内容\n")[-1] if "### 检索到的内容\n" in part else "" for part in search_display.split("### 联网搜索结果")])
1507
+ if not chunks_info.strip():
1508
+ chunks_info = "检索内容已在上方显示"
1509
+ final_display = f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 本地知识库分析\n已完成向量检索分析\n\n### 检索到的内容\n{chunks_info}\n\n### 综合分析\n{final_status}"
1510
+
1511
+ yield final_display, combined_answer
1512
+ except Exception as e:
1513
+ # 如果合并失败,使用现有答案
1514
+ error_status = f"合并结果失败: {str(e)}"
1515
+ if multi_hop:
1516
+ final_display = f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 本地知识库分析\n已完成多跳推理分析,检索到的内容已在上方显示\n\n### 综合分析\n{error_status}"
1517
+ else:
1518
+ # 获取之前检索到的内容
1519
+ chunks_info = "".join([part.split("### 检索到的内容\n")[-1] if "### 检索到的内容\n" in part else "" for part in search_display.split("### 联网搜索结果")])
1520
+ if not chunks_info.strip():
1521
+ chunks_info = "检索内容已在上方显示"
1522
+ final_display = f"### 联网搜索结果\n{search_result}\n\n### 知识库: {kb_name}\n### 本地知识库分析\n已完成向量检索分析\n\n### 检索到的内容\n{chunks_info}\n\n### 综合分析\n{error_status}"
1523
+
1524
+ yield final_display, current_answer
1525
+
1526
+ except Exception as e:
1527
+ error_msg = f"处理失败:{str(e)}\n{traceback.format_exc()}"
1528
+ yield f"### 错误信息\n{error_msg}", f"处理您的问题时遇到错误:{str(e)}"
1529
+
1530
+ # 添加处理函数,批量上传文件到指定知识库
1531
+ def batch_upload_to_kb(file_objs: List, kb_name: str) -> str:
1532
+ """批量上传文件到指定知识库并进行处理"""
1533
+ try:
1534
+ if not kb_name or not kb_name.strip():
1535
+ return "错误:未指定知识库"
1536
+
1537
+ # 确保知识库目录存在
1538
+ kb_dir = os.path.join(KB_BASE_DIR, kb_name)
1539
+ if not os.path.exists(kb_dir):
1540
+ os.makedirs(kb_dir, exist_ok=True)
1541
+
1542
+ if not file_objs or len(file_objs) == 0:
1543
+ return "错误:未选择任何文件"
1544
+
1545
+ return process_and_index_files(file_objs, kb_name)
1546
+ except Exception as e:
1547
+ return f"上传文件到知识库失败: {str(e)}"
1548
+
1549
+ # Gradio 界面 - 修改为支持多知识库
1550
+ custom_css = """
1551
+ .web-search-toggle .form { display: flex !important; align-items: center !important; }
1552
+ .web-search-toggle .form > label { order: 2 !important; margin-left: 10px !important; }
1553
+ .web-search-toggle .checkbox-wrap { order: 1 !important; background: #d4e4d4 !important; border-radius: 15px !important; padding: 2px !important; width: 50px !important; height: 28px !important; }
1554
+ .web-search-toggle .checkbox-wrap .checkbox-container { width: 24px !important; height: 24px !important; transition: all 0.3s ease !important; }
1555
+ .web-search-toggle input:checked + .checkbox-wrap { background: #2196F3 !important; }
1556
+ .web-search-toggle input:checked + .checkbox-wrap .checkbox-container { transform: translateX(22px) !important; }
1557
+ #search-results { max-height: 400px; overflow-y: auto; border: 1px solid #2196F3; border-radius: 5px; padding: 10px; background-color: #e7f0f9; }
1558
+ #question-input { border-color: #2196F3 !important; }
1559
+ #answer-output { background-color: #f0f7f0; border-color: #2196F3 !important; max-height: 400px; overflow-y: auto; }
1560
+ .submit-btn { background-color: #2196F3 !important; border: none !important; }
1561
+ .reasoning-steps { background-color: #f0f7f0; border: 1px dashed #2196F3; padding: 10px; margin-top: 10px; border-radius: 5px; }
1562
+ .loading-spinner { display: inline-block; width: 20px; height: 20px; border: 3px solid rgba(33, 150, 243, 0.3); border-radius: 50%; border-top-color: #2196F3; animation: spin 1s ease-in-out infinite; }
1563
+ @keyframes spin { to { transform: rotate(360deg); } }
1564
+ .stream-update { animation: fade 0.5s ease-in-out; }
1565
+ @keyframes fade { from { background-color: rgba(33, 150, 243, 0.1); } to { background-color: transparent; } }
1566
+ .status-box { padding: 10px; border-radius: 5px; margin-bottom: 10px; font-weight: bold; }
1567
+ .status-processing { background-color: #e3f2fd; color: #1565c0; border-left: 4px solid #2196F3; }
1568
+ .status-success { background-color: #e8f5e9; color: #2e7d32; border-left: 4px solid #4CAF50; }
1569
+ .status-error { background-color: #ffebee; color: #c62828; border-left: 4px solid #f44336; }
1570
+ .multi-hop-toggle .form { display: flex !important; align-items: center !important; }
1571
+ .multi-hop-toggle .form > label { order: 2 !important; margin-left: 10px !important; }
1572
+ .multi-hop-toggle .checkbox-wrap { order: 1 !important; background: #d4e4d4 !important; border-radius: 15px !important; padding: 2px !important; width: 50px !important; height: 28px !important; }
1573
+ .multi-hop-toggle .checkbox-wrap .checkbox-container { width: 24px !important; height: 24px !important; transition: all 0.3s ease !important; }
1574
+ .multi-hop-toggle input:checked + .checkbox-wrap { background: #4CAF50 !important; }
1575
+ .multi-hop-toggle input:checked + .checkbox-wrap .checkbox-container { transform: translateX(22px) !important; }
1576
+ .kb-management { border: 1px solid #2196F3; border-radius: 5px; padding: 15px; margin-bottom: 15px; background-color: #f0f7ff; }
1577
+ .kb-selector { margin-bottom: 10px; }
1578
+ /* 缩小文件上传区域高度 */
1579
+ .compact-upload {
1580
+ margin-bottom: 10px;
1581
+ }
1582
+
1583
+ .file-upload.compact {
1584
+ padding: 10px; /* 减小内边距 */
1585
+ min-height: 120px; /* 减小最小高度 */
1586
+ margin-bottom: 10px;
1587
+ }
1588
+
1589
+ /* 优化知识库内容显示区域 */
1590
+ .kb-files-list {
1591
+ height: 400px;
1592
+ overflow-y: auto;
1593
+ }
1594
+
1595
+ /* 确保右侧列有足够空间 */
1596
+ #kb-files-group {
1597
+ height: 100%;
1598
+ display: flex;
1599
+ flex-direction: column;
1600
+ }
1601
+ .kb-files-list { max-height: 250px; overflow-y: auto; border: 1px solid #ccc; border-radius: 5px; padding: 10px; margin-top: 10px; background-color: #f9f9f9; }
1602
+ #kb-management-container {
1603
+ max-width: 800px !important;
1604
+ margin: 0 !important; /* 移除自动边距,靠左对齐 */
1605
+ margin-left: 20px !important; /* 添加左边距 */
1606
+ }
1607
+ .container {
1608
+ max-width: 1200px !important;
1609
+ margin: 0 auto !important;
1610
+ }
1611
+ .file-upload {
1612
+ border: 2px dashed #2196F3;
1613
+ padding: 15px;
1614
+ border-radius: 10px;
1615
+ background-color: #f0f7ff;
1616
+ margin-bottom: 15px;
1617
+ }
1618
+ .tabs.tab-selected {
1619
+ background-color: #e3f2fd;
1620
+ border-bottom: 3px solid #2196F3;
1621
+ }
1622
+ .group {
1623
+ border: 1px solid #e0e0e0;
1624
+ border-radius: 8px;
1625
+ padding: 10px;
1626
+ margin-bottom: 15px;
1627
+ background-color: #fafafa;
1628
+ }
1629
+
1630
+ /* 添加更多针对知识库管理页面的样式 */
1631
+ #kb-controls, #kb-file-upload, #kb-files-group {
1632
+ width: 100% !important;
1633
+ max-width: 800px !important;
1634
+ margin-right: auto !important;
1635
+ }
1636
+
1637
+ /* 修改Gradio默认的标签页样式以支持左对齐 */
1638
+ .tabs > .tab-nav > button {
1639
+ flex: 0 1 auto !important; /* 修改为不自动扩展,只占用必要空间 */
1640
+ }
1641
+ .tabs > .tabitem {
1642
+ padding-left: 0 !important; /* 移除左边距,使内容靠左 */
1643
+ }
1644
+ /* 对于首页的顶部标题部分 */
1645
+ #app-container h1, #app-container h2, #app-container h3,
1646
+ #app-container > .prose {
1647
+ text-align: left !important;
1648
+ padding-left: 20px !important;
1649
+ }
1650
+ """
1651
+
1652
+ custom_theme = gr.themes.Soft(
1653
+ primary_hue="blue",
1654
+ secondary_hue="blue",
1655
+ neutral_hue="gray",
1656
+ text_size="lg",
1657
+ spacing_size="md",
1658
+ radius_size="md"
1659
+ )
1660
+
1661
+ # 添加简单的JavaScript,通过html组件实现
1662
+ js_code = """
1663
+ <script>
1664
+ document.addEventListener('DOMContentLoaded', function() {
1665
+ // 当页面加载完毕后,找到提交按钮,并为其添加点击事件
1666
+ const observer = new MutationObserver(function(mutations) {
1667
+ // 找到提交按钮
1668
+ const submitButton = document.querySelector('button[data-testid="submit"]');
1669
+ if (submitButton) {
1670
+ submitButton.addEventListener('click', function() {
1671
+ // 找到检索标签页按钮并点击它
1672
+ setTimeout(function() {
1673
+ const retrievalTab = document.querySelector('[data-testid="tab-button-retrieval-tab"]');
1674
+ if (retrievalTab) retrievalTab.click();
1675
+ }, 100);
1676
+ });
1677
+ observer.disconnect(); // 一旦找到并设置事件,停止观察
1678
+ }
1679
+ });
1680
+
1681
+ // 开始观察文档变化
1682
+ observer.observe(document.body, { childList: true, subtree: true });
1683
+ });
1684
+ </script>
1685
+ """
1686
+
1687
+ with gr.Blocks(title="医疗知识问答系统", theme=custom_theme, css=custom_css, elem_id="app-container") as demo:
1688
+ with gr.Column(elem_id="header-container"):
1689
+ gr.Markdown("""
1690
+ # 🏥 医疗知识问答系统
1691
+ **智能医疗助手,支持多知识库管理、多轮对话、普通语义检索和高级多跳推理**
1692
+ 本系统支持创建多个知识库,上传TXT或PDF文件,通过语义向量检索或创新的多跳推理机制提供医疗信息查询服务。
1693
+ """)
1694
+
1695
+ # 添加JavaScript脚本
1696
+ gr.HTML(js_code, visible=False)
1697
+
1698
+ # 使用State来存储对话历史
1699
+ chat_history_state = gr.State([])
1700
+
1701
+ # 创建标签页
1702
+ with gr.Tabs() as tabs:
1703
+ # 知识库管理标签页
1704
+ with gr.TabItem("知识库管理"):
1705
+ with gr.Row():
1706
+ # 左侧列:控制区
1707
+ with gr.Column(scale=1, min_width=400):
1708
+ gr.Markdown("### 📚 知识库管理与构建")
1709
+
1710
+ with gr.Row(elem_id="kb-controls"):
1711
+ with gr.Column(scale=1):
1712
+ new_kb_name = gr.Textbox(
1713
+ label="新知识库名称",
1714
+ placeholder="输入新知识库名称",
1715
+ lines=1
1716
+ )
1717
+ create_kb_btn = gr.Button("创建知识库", variant="primary", scale=1)
1718
+
1719
+ with gr.Column(scale=1):
1720
+ current_kbs = get_knowledge_bases()
1721
+ kb_dropdown = gr.Dropdown(
1722
+ label="选择知识库",
1723
+ choices=current_kbs,
1724
+ value=DEFAULT_KB if DEFAULT_KB in current_kbs else (current_kbs[0] if current_kbs else None),
1725
+ elem_classes="kb-selector"
1726
+ )
1727
+
1728
+ with gr.Row():
1729
+ refresh_kb_btn = gr.Button("刷新列表", size="sm", scale=1)
1730
+ delete_kb_btn = gr.Button("删除知识库", size="sm", variant="stop", scale=1)
1731
+
1732
+ kb_status = gr.Textbox(label="知识库状态", interactive=False, placeholder="选择或创建知识库")
1733
+
1734
+ with gr.Group(elem_id="kb-file-upload", elem_classes="compact-upload"):
1735
+ gr.Markdown("### 📄 上传文件到知识库")
1736
+ file_upload = gr.File(
1737
+ label="选择文件(支持多选TXT/PDF)",
1738
+ type="filepath",
1739
+ file_types=[".txt", ".pdf"],
1740
+ file_count="multiple",
1741
+ elem_classes="file-upload compact"
1742
+ )
1743
+ upload_status = gr.Textbox(label="上传状态", interactive=False, placeholder="上传后显示状态")
1744
+
1745
+ kb_select_for_chat = gr.Dropdown(
1746
+ label="为对话选择知识库",
1747
+ choices=current_kbs,
1748
+ value=DEFAULT_KB if DEFAULT_KB in current_kbs else (current_kbs[0] if current_kbs else None),
1749
+ visible=False # 隐藏,仅用于同步
1750
+ )
1751
+
1752
+ with gr.Column(scale=1, min_width=400):
1753
+ with gr.Group(elem_id="kb-files-group"):
1754
+ gr.Markdown("### 📋 知识库内容")
1755
+ kb_files_list = gr.Markdown(
1756
+ value="选择知识库查看文件...",
1757
+ elem_classes="kb-files-list"
1758
+ )
1759
+
1760
+ # 用于对话界面的知识库选择器
1761
+ kb_select_for_chat = gr.Dropdown(
1762
+ label="为对话选择知识库",
1763
+ choices=current_kbs,
1764
+ value=DEFAULT_KB if DEFAULT_KB in current_kbs else (current_kbs[0] if current_kbs else None),
1765
+ visible=False # 隐藏,仅用于同步
1766
+ )
1767
+
1768
+ # 对话交互标签页
1769
+ with gr.TabItem("对话交互"):
1770
+ with gr.Row():
1771
+ with gr.Column(scale=1):
1772
+ gr.Markdown("### ⚙️ 对话设置")
1773
+
1774
+ kb_dropdown_chat = gr.Dropdown(
1775
+ label="选择知识库进行对话",
1776
+ choices=current_kbs,
1777
+ value=DEFAULT_KB if DEFAULT_KB in current_kbs else (current_kbs[0] if current_kbs else None),
1778
+ )
1779
+
1780
+ with gr.Row():
1781
+ web_search_toggle = gr.Checkbox(
1782
+ label="🌐 启用联网搜索",
1783
+ value=True,
1784
+ info="获取最新医疗动态",
1785
+ elem_classes="web-search-toggle"
1786
+ )
1787
+ table_format_toggle = gr.Checkbox(
1788
+ label="📊 表格格式输出",
1789
+ value=True,
1790
+ info="使用Markdown表格展示结构化回答",
1791
+ elem_classes="web-search-toggle"
1792
+ )
1793
+
1794
+ multi_hop_toggle = gr.Checkbox(
1795
+ label="🔄 启用多跳推理",
1796
+ value=False,
1797
+ info="使用高级多跳推理机制(较慢但更全面)",
1798
+ elem_classes="multi-hop-toggle"
1799
+ )
1800
+
1801
+ with gr.Accordion("显示检索进展", open=False):
1802
+ search_results_output = gr.Markdown(
1803
+ label="检索过程",
1804
+ elem_id="search-results",
1805
+ value="等待提交问题..."
1806
+ )
1807
+
1808
+ with gr.Column(scale=3):
1809
+ gr.Markdown("### 💬 对话历史")
1810
+ chatbot = gr.Chatbot(
1811
+ elem_id="chatbot",
1812
+ label="对话历史",
1813
+ height=550
1814
+ )
1815
+
1816
+ with gr.Row():
1817
+ question_input = gr.Textbox(
1818
+ label="输入医疗健康相关问题",
1819
+ placeholder="例如:2型糖尿病的症状和治疗方法有哪些?",
1820
+ lines=2,
1821
+ elem_id="question-input"
1822
+ )
1823
+
1824
+ with gr.Row(elem_classes="submit-row"):
1825
+ submit_btn = gr.Button("提交问题", variant="primary", elem_classes="submit-btn")
1826
+ clear_btn = gr.Button("清空输入", variant="secondary")
1827
+ clear_history_btn = gr.Button("清空对话历史", variant="secondary", elem_classes="clear-history-btn")
1828
+
1829
+ # 状态显示框
1830
+ status_box = gr.HTML(
1831
+ value='<div class="status-box status-processing">准备就绪,等待您的问题</div>',
1832
+ visible=True
1833
+ )
1834
+
1835
+ gr.Examples(
1836
+ examples=[
1837
+ ["2型糖尿病的症状和治疗方法有哪些?"],
1838
+ ["高血压患者的日常饮食应该注意什么?"],
1839
+ ["肺癌的早期症状和筛查方法是什么?"],
1840
+ ["新冠肺炎后遗症有哪些表现?如何缓解?"],
1841
+ ["儿童过敏性鼻炎的诊断标准和治疗方案有哪些?"]
1842
+ ],
1843
+ inputs=question_input,
1844
+ label="示例问题(点击尝试)"
1845
+ )
1846
+
1847
+ # 创建知识库函数
1848
+ def create_kb_and_refresh(kb_name):
1849
+ result = create_knowledge_base(kb_name)
1850
+ kbs = get_knowledge_bases()
1851
+ # 更新两个下拉菜单
1852
+ return result, gr.update(choices=kbs, value=kb_name if "创建成功" in result else None), gr.update(choices=kbs, value=kb_name if "创建成功" in result else None)
1853
+
1854
+ # 刷新知识库列表
1855
+ def refresh_kb_list():
1856
+ kbs = get_knowledge_bases()
1857
+ # 更新两个下拉菜单
1858
+ return gr.update(choices=kbs, value=kbs[0] if kbs else None), gr.update(choices=kbs, value=kbs[0] if kbs else None)
1859
+
1860
+ # 删除知识库
1861
+ def delete_kb_and_refresh(kb_name):
1862
+ result = delete_knowledge_base(kb_name)
1863
+ kbs = get_knowledge_bases()
1864
+ # 更新两个下拉菜单
1865
+ return result, gr.update(choices=kbs, value=kbs[0] if kbs else None), gr.update(choices=kbs, value=kbs[0] if kbs else None)
1866
+
1867
+ # 更新知识库文件列表
1868
+ def update_kb_files_list(kb_name):
1869
+ if not kb_name:
1870
+ return "未选择知识库"
1871
+
1872
+ files = get_kb_files(kb_name)
1873
+ kb_dir = os.path.join(KB_BASE_DIR, kb_name)
1874
+ has_index = os.path.exists(os.path.join(kb_dir, "semantic_chunk.index"))
1875
+
1876
+ if not files:
1877
+ files_str = "知识库中暂无文件"
1878
+ else:
1879
+ files_str = "**文件列表:**\n\n" + "\n".join([f"- {file}" for file in files])
1880
+
1881
+ index_status = "\n\n**索引状态:** " + ("✅ 已建立索引" if has_index else "❌ 未建立索引")
1882
+
1883
+ return f"### 知识库: {kb_name}\n\n{files_str}{index_status}"
1884
+
1885
+ # 同步知识库选择 - 管理界面到对话界面
1886
+ def sync_kb_to_chat(kb_name):
1887
+ return gr.update(value=kb_name)
1888
+
1889
+ # 同步知识库选择 - 对话界面到管理界面
1890
+ def sync_chat_to_kb(kb_name):
1891
+ return gr.update(value=kb_name), update_kb_files_list(kb_name)
1892
+
1893
+ # 处理文件上传到指定知识库
1894
+ def process_upload_to_kb(files, kb_name):
1895
+ if not kb_name:
1896
+ return "错误:未选择知识库"
1897
+
1898
+ result = batch_upload_to_kb(files, kb_name)
1899
+ # 更新知识库文件列表
1900
+ files_list = update_kb_files_list(kb_name)
1901
+ return result, files_list
1902
+
1903
+ # 知识库选择变化时
1904
+ def on_kb_change(kb_name):
1905
+ if not kb_name:
1906
+ return "未选择知识库", "选择知识库查看文件..."
1907
+
1908
+ kb_dir = os.path.join(KB_BASE_DIR, kb_name)
1909
+ has_index = os.path.exists(os.path.join(kb_dir, "semantic_chunk.index"))
1910
+ status = f"已选择知识库: {kb_name}" + (" (已建立索引)" if has_index else " (未建立索引)")
1911
+
1912
+ # 更新文件列表
1913
+ files_list = update_kb_files_list(kb_name)
1914
+
1915
+ return status, files_list
1916
+
1917
+ # 创建知识库按钮功能
1918
+ create_kb_btn.click(
1919
+ fn=create_kb_and_refresh,
1920
+ inputs=[new_kb_name],
1921
+ outputs=[kb_status, kb_dropdown, kb_dropdown_chat]
1922
+ ).then(
1923
+ fn=lambda: "", # 清空输入框
1924
+ inputs=[],
1925
+ outputs=[new_kb_name]
1926
+ )
1927
+
1928
+ # 刷新知识库列表按钮功能
1929
+ refresh_kb_btn.click(
1930
+ fn=refresh_kb_list,
1931
+ inputs=[],
1932
+ outputs=[kb_dropdown, kb_dropdown_chat]
1933
+ )
1934
+
1935
+ # 删除知识库按钮功能
1936
+ delete_kb_btn.click(
1937
+ fn=delete_kb_and_refresh,
1938
+ inputs=[kb_dropdown],
1939
+ outputs=[kb_status, kb_dropdown, kb_dropdown_chat]
1940
+ ).then(
1941
+ fn=update_kb_files_list,
1942
+ inputs=[kb_dropdown],
1943
+ outputs=[kb_files_list]
1944
+ )
1945
+
1946
+ # 知识库选择变化时 - 管理界面
1947
+ kb_dropdown.change(
1948
+ fn=on_kb_change,
1949
+ inputs=[kb_dropdown],
1950
+ outputs=[kb_status, kb_files_list]
1951
+ ).then(
1952
+ fn=sync_kb_to_chat,
1953
+ inputs=[kb_dropdown],
1954
+ outputs=[kb_dropdown_chat]
1955
+ )
1956
+
1957
+ # 知识库选择变化时 - 对话界面
1958
+ kb_dropdown_chat.change(
1959
+ fn=sync_chat_to_kb,
1960
+ inputs=[kb_dropdown_chat],
1961
+ outputs=[kb_dropdown, kb_files_list]
1962
+ )
1963
+
1964
+ # 处理文件上传
1965
+ file_upload.upload(
1966
+ fn=process_upload_to_kb,
1967
+ inputs=[file_upload, kb_dropdown],
1968
+ outputs=[upload_status, kb_files_list]
1969
+ )
1970
+
1971
+ # 清空输入按钮功能
1972
+ clear_btn.click(
1973
+ fn=lambda: "",
1974
+ inputs=[],
1975
+ outputs=[question_input]
1976
+ )
1977
+
1978
+ # 清空对话历史按钮功能
1979
+ def clear_history():
1980
+ return [], []
1981
+
1982
+ clear_history_btn.click(
1983
+ fn=clear_history,
1984
+ inputs=[],
1985
+ outputs=[chatbot, chat_history_state]
1986
+ )
1987
+
1988
+ # 提交按钮 - 开始流式处理
1989
+ def update_status(is_processing=True, is_error=False):
1990
+ if is_processing:
1991
+ return '<div class="status-box status-processing">正在处理您的问题...</div>'
1992
+ elif is_error:
1993
+ return '<div class="status-box status-error">处理过程中出现错误</div>'
1994
+ else:
1995
+ return '<div class="status-box status-success">回答已生成完毕</div>'
1996
+
1997
+ # 处理问题并更新对话历史
1998
+ def process_and_update_chat(question, kb_name, use_search, use_table_format, multi_hop, chat_history):
1999
+ if not question.strip():
2000
+ # in a generator handler a bare "return value" never reaches the UI; yield first, then stop
+ yield chat_history, update_status(False, True), "等待提交问题..."
+ return
2001
+
2002
+ try:
2003
+ # 首先更新聊天界面,显示用户问题
2004
+ chat_history.append([question, "正在思考..."])
2005
+ yield chat_history, update_status(True), f"开始处理您的问题,使用知识库: {kb_name}..."
2006
+
2007
+ # 用于累积检索状态和答案
2008
+ last_search_display = ""
2009
+ last_answer = ""
2010
+
2011
+ # 使用生成器进行流式处理
2012
+ for search_display, answer in process_question_with_reasoning(question, kb_name, use_search, use_table_format, multi_hop, chat_history[:-1]):
2013
+ # 更新检索状态和答案
2014
+ last_search_display = search_display
2015
+ last_answer = answer
2016
+
2017
+ # 更新聊天历史中的最后一条(当前的回答)
2018
+ if chat_history:
2019
+ chat_history[-1][1] = answer
2020
+ yield chat_history, update_status(True), search_display
2021
+
2022
+ # 处理完成,更新状态
2023
+ yield chat_history, update_status(False), last_search_display
2024
+
2025
+ except Exception as e:
2026
+ # 发生错误时更新状态和聊天历史
2027
+ error_msg = f"处理问题时出错: {str(e)}"
2028
+ if chat_history:
2029
+ chat_history[-1][1] = error_msg
2030
+ yield chat_history, update_status(False, True), f"### 错误\n{error_msg}"
2031
+
2032
+ # 连接提交按钮
2033
+ submit_btn.click(
2034
+ fn=process_and_update_chat,
2035
+ inputs=[question_input, kb_dropdown_chat, web_search_toggle, table_format_toggle, multi_hop_toggle, chat_history_state],
2036
+ outputs=[chatbot, status_box, search_results_output],
2037
+ queue=True
2038
+ ).then(
2039
+ fn=lambda: "", # 清空输入框
2040
+ inputs=[],
2041
+ outputs=[question_input]
2042
+ ).then(
2043
+ fn=lambda h: h, # 更新state
2044
+ inputs=[chatbot],
2045
+ outputs=[chat_history_state]
2046
+ )
2047
+
2048
+ # 支持Enter键提交
2049
+ question_input.submit(
2050
+ fn=process_and_update_chat,
2051
+ inputs=[question_input, kb_dropdown_chat, web_search_toggle, table_format_toggle, multi_hop_toggle, chat_history_state],
2052
+ outputs=[chatbot, status_box, search_results_output],
2053
+ queue=True
2054
+ ).then(
2055
+ fn=lambda: "", # 清空输入框
2056
+ inputs=[],
2057
+ outputs=[question_input]
2058
+ ).then(
2059
+ fn=lambda h: h, # 更新state
2060
+ inputs=[chatbot],
2061
+ outputs=[chat_history_state]
2062
+ )
2063
+
2064
+ if __name__ == "__main__":
2065
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ protobuf
2
+ cpm_kernels
3
+ torch>=2.0
4
+ jieba==0.42.1
5
+ lxml==4.9.4
6
+ gradio
7
+ mdtex2html
8
+ sentencepiece
9
+ sse-starlette
10
+ streamlit>=1.24.0
11
+ pyfunctional==1.3.0
12
+ chardet
13
+ faiss-cpu
14
+ PyMuPDF
15
+ openai
16
+ llama-index
retrievor.py ADDED
@@ -0,0 +1,254 @@
1
+ import requests
2
+ import json
3
+ import re
4
+ from lxml import etree
5
+ import chardet
6
+ import jieba.analyse
7
+ from text2vec import *
8
+ from config import Config
9
+
10
+ def search_bing(query):
11
+ """利用newbing搜索接口,用于检索与query相关的背景信息,作为检索内容
12
+ input:query
13
+ output: [{'url':'','text':'','title':''}, ...]
14
+ """
15
+ headers = {
16
+ 'Cookie': 'MUID=2CFCFC26663D64393955ED1C623D62A4; MUIDB=2CFCFC26663D64393955ED1C623D62A4; SRCHD=AF=S00028; SRCHUID=V=2&GUID=76DC1CA8309043BBAB81CFC4C47D76DD&dmnchg=1; _UR=QS=0&TQS=0; MicrosoftApplicationsTelemetryDeviceId=64c1979f-ee59-40a7-928e-b755865bc6ae; ABDEF=V=13&ABDV=13&MRNB=1696643925014&MRB=0; ANON=A=15BC3EC2F3AC041DAD2C715CFFFFFFFF&E=1d05&W=2; NAP=V=1.9&E=1cab&C=MnJiRko1YRJfqV6H22giKijH0-4G1Ub50-Cg7gnMPMN4QFF_OeDZsQ&W=2; PPLState=1; _HPVN=CS=eyJQbiI6eyJDbiI6NiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6NiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6NiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMy0xMi0xMVQwMDowMDowMFoiLCJJb3RkIjowLCJHd2IiOjAsIlRucyI6MCwiRGZ0IjpudWxsLCJNdnMiOjAsIkZsdCI6MCwiSW1wIjoxNiwiVG9iYnMiOjB9; _EDGE_S=SID=2E9C7DC6F71A6094195D6E28F6C8614B; USRLOC=HS=1&ELOC=LAT=30.26828956604004|LON=120.13351440429688|N=%E8%A5%BF%E6%B9%96%E5%8C%BA%EF%BC%8C%E6%B5%99%E6%B1%9F%E7%9C%81|ELT=4|; _Rwho=u=d; _SS=SID=2E9C7DC6F71A6094195D6E28F6C8614B&R=64&RB=64&GB=0&RG=0&RP=61; KievRPSSecAuth=FABKBBRaTOJILtFsMkpLVWSG6AN6C/svRwNmAAAEgAAACFAPF0afImbrCASVm1xT1K+FiXTsQgaoq6RydL+Ge3FvFrIbHVbXa7m0OlZNQJT4P62pu6xUtDTqwGPUE13tWBwVPkK1RahHVaGuUSLfwbp5o2HeLnKa+hfc6+sJiYHnxklhiJAzdi/oBbiWdDkf+5A+C0Fbsxeo4pQDt+kmeKhWpMwijA0bVP5ISXdkrLsRv5jiq97srkAMWFHqqGboI70LdX7ahqSSiykzwFdA1np3WhYhURWQ4b3z6uV7nsZpth6lpdafGZ2YLWr0Zwpv1D210P04ovzbbzvuKGoeljS4/SvdX8QUoGONzn0f2OXAOPvsnZJctbwxH/tkembDlpN4liJDCYhlYgoKtg5nuLBNihk75VctLodAQhosDNYM9stJRzQlusK+aEbDQKAgXunPwB0iPq0ECEVmLIApOeXs7DEtj29Q8zuWiOmxXnddGDm4Tf0VWUVjAEfP/PKiiTLAAS/dwPgOslgEdpy3Pw6GQYo3z3dZ16mWuXYX53utgdkK4rtqRj/FmYiTRjL6scm7Ds0UJnVNxdJcFACadTOzNVEGBp2XIb6XEAWZThz21+JJCn325RXG+zwJyjaKI941n6CbQ8Z/dXgUYMBsn/gfdGV3/+vz05pIOtB1zmzkvwds5v4M/zTcf5fgqWwLjSbIBFscYA626llQwDS6LkKwyoB/EB3L0XgLnOFpoSSpk41L/q5e0GkLVxzPA5kZue0iLTNEXUu/KCGnPOBkK0iAZVS/bJPVa3ZBPBOODwXnAUR0s0W1hbHLDW4I1ZrMuujx08DU0/nhhiq0mFgwwxHrd4vE9xdecjlpyL78pzPf5LVAiCKaWZ/BnKqHCYHA4hiEg8ffC5eFwoA6JsL0wtvTSdaAPEcUs103Um9eje8nNKwvDtesqh93lOAbNCfkfC/zAdtsR0dWaZIsYdAeMNQE+6//zLDbGIe24WVsSdiwZqdmYI2ICxE+KqPY++Ei4gfgKt0GNyiAfK0qSfALb01POx95rWOyWSPd0ge6DwM5mHAZfTePR44vBfFdhvUYBg0+47nOzY53hcO/6unDb3u1/PLHM7+rlS+76yjrZ9Dl7cFXRNBINy5afDUY+kU9JQS6QTbc5EIQTynlWkGU38m43GtWXquitzrjHuC0mYsUbLQOuZ1kFWHQXF/4I/aaxU1k0uvEOttkIUkhXi5lKo9uLoPGdha+AIGcDz8icpdDnfAHHpChm0YB8K8lcL0foY6NCib+o+LCLfriZg9Nvtkc8s1+TWPvCvHZX4bZuXyN4tHoQiysRd6j0gyJpLR4yQr5iOyBUgkM9WWKzkFmnzVYlb4ec6wpowsw2643AHs5Ge1FDjzKw3TdSVnwB2dHFh7tdNW1ywYDAGhpv8SSvQ66448UANVqB1uKwxsD0mXJR/tjMy9OuiNR; _U=1S7ve-XVb_pOh5Iig5kQlQDI6wv9BNl9HiCEtz0dS6dNV_UrQUBmAFVEZx7pYNRTwRxGG8eASH_IDUlpJu04SCp8aeYlPHkU_-0xGzlVA3nTqaE9kSUyIm1UVQYovjbOrsh4SeBbU-wrjqz6HV2DeUKJiHyTwYlDeQ8bYboyqhB4-ER5PjMGcp8daGbur9ER2KSm-nJOeUqnWeIawk0BVyw; WLS=C=26d7831f7a57e7fd&N=; SRCHUSR=DOB=20220207&T=1703137884000&TPC=1703137888000&POEX=W; SRCHHPGUSR=SRCHLANG=zh-Hans&BZA=0&BRW=HTP&BRH=M&CW=975&CH=931&SW=1920&SH=1080&DPR=1&UTC=480&DM=0&HV=1703137888&WTS=63838734684&PRVCW=975&PRVCH=931&SCW=1164&SCH=2821&PV=10.0.0; WLID=mA9cZwKIoBbQ9h07pbmfeYJEn7iBxd5sk7A9mKFJf1dP4SWmtri4X9d1xcl06hKEVmEpT+5GB21NeHYv/uk3maNbHalTEB+UwCwfS7RdzoQ=; 
_RwBf=r=0&ilt=1&ihpd=1&ispd=0&rc=64&rb=64&gb=0&rg=0&pc=61&mtu=0&rbb=0.0&g=0&cid=&clo=0&v=1&l=2023-12-20T08:00:00.0000000Z&lft=0001-01-01T00:00:00.0000000&aof=0&o=0&p=bingcopilotwaitlist&c=MY00IA&t=3001&s=2023-03-20T09:14:27.6545941+00:00&ts=2023-12-21T05:51:26.7082437+00:00&rwred=0&wls=2&wlb=0&lka=0&lkt=0&aad=0&TH=&mta=0&e=CS-LRz6MT6YjyZDqHmn2zXGq0iVnD2Plg7iI7uA3t-iwF4TTPdW2rejPh5N_c6syhuNr1-uNgqm8vKVLqjaaig&A=15BC3EC2F3AC041DAD2C715CFFFFFFFF&wle=1&ccp=0&ard=0001-01-01T00:00:00.0000000; ipv6=hit=1703141490169&t=4',
17
+ 'Accept-Encoding': 'gzip, deflate',
18
+ 'Accept-Language': 'zh-CN,zh;q=0.9',
19
+ 'Accept': '*/*',
20
+ 'Referer': 'https://cn.bing.com/search?',
21
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
22
+ }
23
+
24
+ res = []
25
+ url = 'https://cn.bing.com/search?q=' + query + '&qs=n&form=QBRE'
26
+ r = requests.get(url, headers=headers)
27
+ try:
28
+ encoding = chardet.detect(r.content)['encoding']
29
+ r.encoding = encoding
30
+ dom = etree.HTML(r.content.decode(encoding))
31
+ except Exception:
32
+ dom = etree.HTML(r.content)
33
+
34
+ url_list = []
35
+ tmp_url = []
36
+ #只采集列表的第一页
37
+ for sel in dom.xpath('//ol[@id="b_results"]/li/h2'):
38
+ l = ''.join(sel.xpath('a/@href'))
39
+ title = ''.join(sel.xpath('a//text()')).split('-')[0].strip()
40
+ if 'http' in l and l not in tmp_url and 'doc.' not in l:
41
+ url_list.append([l,title])
42
+ tmp_url.append(l)
43
+ for turl,title in url_list:
44
+ try:
45
+ tr = requests.get(turl, headers=headers, timeout=(5, 5))
46
+ tdom = etree.HTML(tr.content.decode('utf-8'))
47
+ text = '\n'.join(tdom.xpath('//p/text()'))
48
+ if len(text) > 15:
49
+ tmp = {}
50
+ tmp['url'] = turl
51
+ tmp['text'] = text
52
+ tmp['title'] = title
53
+ res.append(tmp)
54
+ except Exception as e:
55
+ print(e)
56
+ pass
57
+ return res
58
+
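A short usage sketch for `search_bing`; the query and printed fields are illustrative, and each returned item is a dict with the `url`, `text`, and `title` keys built above.

results = search_bing("2型糖尿病 一线治疗")
for r in results[:3]:
    print(r["title"], r["url"])   # r["text"] holds the concatenated <p> text of the fetched page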
59
+
60
+
61
+
62
+ class TextRecallRank():
63
+ """
64
+ 实现对检索内容的召回与排序
65
+ """
66
+
67
+ def __init__(self,cfg):
68
+ self.topk = cfg.topk #query关键词召回的数量
69
+ self.topd = cfg.topd #召回文章的数量
70
+ self.topt = cfg.topt #召回文本片段的数量
71
+ self.maxlen = cfg.maxlen #召回文本片段的长度
72
+ self.recall_way = cfg.recall_way #召回方式
73
+
74
+
75
+
76
+ def query_analyze(self,query):
77
+ """query的解析,目前利用jieba进行关键词提取
78
+ input:query,topk
79
+ output:
80
+ keywords:{'word':[]}
81
+ total_weight: float number
82
+ """
83
+ keywords = jieba.analyse.extract_tags(query, topK=self.topk, withWeight=True)
84
+ if not keywords:  # guard: jieba may return no keywords for very short queries
+ return [], 0.0
+ total_weight = self.topk / sum([r[1] for r in keywords])
85
+ return keywords,total_weight
86
+
87
+ def text_segmentate(self, text, maxlen, seps='\n', strips=None):
88
+ """将文本按照标点符号划分为若干个短句
89
+ """
90
+ text = text.strip().strip(strips)
91
+ if seps and len(text) > maxlen:
92
+ pieces = text.split(seps[0])
93
+ text, texts = '', []
94
+ for i, p in enumerate(pieces):
95
+ if text and p and len(text) + len(p) > maxlen - 1:
96
+ texts.extend(self.text_segmentate(text, maxlen, seps[1:], strips))
97
+ text = ''
98
+ if i + 1 == len(pieces):
99
+ text = text + p
100
+ else:
101
+ text = text + p + seps[0]
102
+ if text:
103
+ texts.extend(self.text_segmentate(text, maxlen, seps[1:], strips))
104
+ return texts
105
+ else:
106
+ return [text]
107
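text_segmentate splits on the first separator, greedily packs pieces up to maxlen, and recurses with the remaining separators on any piece that is still too long. A quick sketch using the module-level trr instance defined at the bottom of this file (illustrative, not part of the commit):

# Split on '\n' first, then on '。', keeping pieces near maxlen = 8 characters.
pieces = trr.text_segmentate('第一句。第二句。第三句。', 8, seps='\n。')
# Expected to yield roughly ['第一句。第二句。', '第三句。'].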
+
108
+ def recall_title_score(self,title,keywords,total_weight):
109
+ """计算query与标题的匹配度"""
110
+ score = 0
111
+ for item in keywords:
112
+ kw, weight = item
113
+ if kw in title:
114
+ score += round(weight * total_weight,4)
115
+ return score
116
+
117
+ def recall_text_score(self, text, keywords, total_weight):
118
+ """计算query与text的匹配程度"""
119
+ score = 0
120
+ for item in keywords:
121
+ kw, weight = item
122
+ p11 = re.compile(re.escape(kw))  # escape the keyword so regex metacharacters cannot break the pattern
123
+ pr = p11.findall(text)
124
+ # score += round(weight * total_weight, 4) * len(pr)
125
+ score += round(weight * total_weight, 4)
126
+ return score
127
+
128
+ def rank_text_by_keywords(self,query,data):
129
+ """通过关键词进行召回"""
130
+
131
+ #query分析
132
+ keywords,total_weight = self.query_analyze(query)
133
+
134
+ # first recall titles
135
+ title_score = {}
136
+ for line in data:
137
+ title = line['title']
138
+ title_score[title] = self.recall_title_score(title,keywords,total_weight)
139
+ title_score = sorted(title_score.items(),key=lambda x:x[1],reverse=True)
140
+ # print(title_score)
141
+ recall_title_list = [t[0] for t in title_score[:self.topd]]
142
+
143
+ # then recall sentences
144
+ sentence_score = {}
145
+ for line in data:
146
+ title = line['title']
147
+ text = line['text']
148
+ if title in recall_title_list:
149
+ for ct in self.text_segmentate(text,self.maxlen, seps='\n。'):
150
+ ct = re.sub(r'\s+', ' ', ct)
151
+ if len(ct)>=20:
152
+ sentence_score[ct] = self.recall_text_score(ct,keywords,total_weight)
153
+
154
+ sentence_score = sorted(sentence_score.items(),key=lambda x:x[1],reverse=True)
155
+ recall_sentence_list = [s[0] for s in sentence_score[:self.topt]]
156
+ return '\n'.join(recall_sentence_list)
157
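The keyword path therefore works in two passes: titles are scored by the rescaled weights of the query keywords they contain and the top topd articles are kept, then their texts are re-split into snippets of at most maxlen characters, scored the same way, and the top topt snippets are joined with newlines. A hedged end-to-end sketch with a hand-built result list (same shape as the search_bing output), again using the module-level trr instance:

# Illustrative data; in practice this list comes from search_bing.
data = [{'title': '杭州天气预报', 'url': 'https://example.com/a',
         'text': '今天杭州多云转晴,气温十八到二十六度,空气质量良好,适合户外活动。'}]
print(trr.rank_text_by_keywords('杭州今天天气怎么样', data))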
+
158
+ def rank_text_by_text2vec(self, query, data):
159
+ """通过text2vec召回"""
160
+ if not data:
161
+ print("Warning: No data provided for ranking")
162
+ return ""
163
+
164
+ # first recall titles
165
+ title_list = [query]
166
+ for line in data:
167
+ title = line['title']
168
+ title_list.append(title)
169
+
170
+ # need at least two entries (query plus one candidate title), otherwise similarity cannot be computed
171
+ if len(title_list) <= 1:
172
+ print("Warning: Not enough titles for similarity calculation")
173
+ return ""
174
+
175
+ title_vectors = get_vector(title_list, 8)
176
+
177
+ # check that vectorization succeeded
178
+ if title_vectors.numel() == 0 or title_vectors.size(0) <= 1:
179
+ print("Warning: Title vectorization failed or returned insufficient vectors")
180
+ return ""
181
+
182
+ title_score = get_sim(title_vectors)
183
+
184
+ # check that the similarity calculation succeeded
185
+ if not title_score:
186
+ print("Warning: Title similarity calculation failed")
187
+ return ""
188
+
189
+ title_score = dict(zip(title_score, range(1, len(title_list))))
190
+ title_score = sorted(title_score.items(), key=lambda x:x[0], reverse=True)
191
+
192
+ # make sure there are scored titles to recall
193
+ if not title_score or self.topd <= 0:
194
+ print("Warning: No title scores or invalid topd parameter")
195
+ return ""
196
+
197
+ recall_title_list = [title_list[t[1]] for t in title_score[:min(self.topd, len(title_score))]]
198
+
199
+ # then recall sentences
200
+ sentence_list = [query]
201
+ for line in data:
202
+ title = line['title']
203
+ text = line['text']
204
+ if title in recall_title_list:
205
+ for ct in self.text_segmentate(text, self.maxlen, seps='\n。'):
206
+ ct = re.sub(r'\s+', ' ', ct)
207
+ if len(ct) >= 20:
208
+ sentence_list.append(ct)
209
+
210
+ # need at least two entries (query plus one candidate sentence), otherwise similarity cannot be computed
211
+ if len(sentence_list) <= 1:
212
+ print("Warning: Not enough sentences for similarity calculation")
213
+ return ""
214
+
215
+ sentence_vectors = get_vector(sentence_list, 8)
216
+
217
+ # check that vectorization succeeded
218
+ if sentence_vectors.numel() == 0 or sentence_vectors.size(0) <= 1:
219
+ print("Warning: Sentence vectorization failed or returned insufficient vectors")
220
+ return ""
221
+
222
+ sentence_score = get_sim(sentence_vectors)
223
+
224
+ # check that the similarity calculation succeeded
225
+ if not sentence_score:
226
+ print("Warning: Sentence similarity calculation failed")
227
+ return ""
228
+
229
+ sentence_score = dict(zip(sentence_score, range(1, len(sentence_list))))
230
+ sentence_score = sorted(sentence_score.items(), key=lambda x:x[0], reverse=True)
231
+
232
+ # make sure there are scored sentences to recall
233
+ if not sentence_score or self.topt <= 0:
234
+ print("Warning: No sentence scores or invalid topt parameter")
235
+ return ""
236
+
237
+ recall_sentence_list = [sentence_list[s[1]] for s in sentence_score[:min(self.topt, len(sentence_score))]]
238
+ return '\n'.join(recall_sentence_list)
239
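The embedding path mirrors the keyword path but ranks by cosine similarity: the query and the candidate titles (then snippets) are embedded together and sorted by similarity to the query. Note that the score-to-index mapping uses the similarity value as a dict key, so exact ties collapse to a single entry. A sketch of that mapping with made-up scores:

# Made-up similarities between the query and three candidate titles.
sims = [0.83, 0.41, 0.77]
score_to_idx = dict(zip(sims, range(1, 4)))      # {0.83: 1, 0.41: 2, 0.77: 3}
ranked = sorted(score_to_idx.items(), key=lambda x: x[0], reverse=True)
top_idx = [i for _, i in ranked[:2]]             # [1, 3] -> indices of the best titles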
+
240
+
241
+ def query_retrieve(self,query):
242
+ # fetch related pages from the search engine
243
+ data = search_bing(query)
244
+ # recall and rank the retrieved content to produce the background text
245
+ if self.recall_way == 'keyword':
246
+ bg_text = self.rank_text_by_keywords(query,data)
247
+ else:
248
+ bg_text = self.rank_text_by_text2vec(query,data)
249
+ return bg_text
250
+
251
+
252
+ cfg = Config()
253
+ trr = TextRecallRank(cfg)
254
+ q_searching = trr.query_retrieve
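The module exposes q_searching as its single entry point. A minimal sketch (illustrative only): when recall_way is set to 'embed' in config.py, this call also needs a valid embedding API key, since the ranking goes through text2vec.

# Fetch and rank background text for a question.
bg_text = q_searching('什么是检索增强生成?')
print(bg_text[:200])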
text2vec.py ADDED
@@ -0,0 +1,160 @@
 
1
+ import os
2
+ import torch
3
+ from functional import seq
4
+ import numpy as np
5
+ import torch.nn.functional as F
6
+ from torch import cosine_similarity
7
+ from config import Config
8
+ from openai import OpenAI
9
+
10
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
11
+
12
+ class TextVector():
13
+ def __init__(self, cfg):
14
+ self.bert_path = cfg.bert_path
15
+
16
+ # read API-related settings from the config file
17
+ self.use_api = getattr(cfg, 'use_api', True)
18
+ self.api_key = getattr(cfg, 'api_key', "sk-5b45aa67249a44d38abca3c02cc78a70")
19
+ self.base_url = getattr(cfg, 'base_url', "https://dashscope.aliyuncs.com/compatible-mode/v1")
20
+ self.model_name = getattr(cfg, 'model_name', "text-embedding-v3")
21
+ self.dimensions = getattr(cfg, 'dimensions', 1024)
22
+ self.batch_size = getattr(cfg, 'batch_size', 10)
23
+
24
+ # only load the local model when the API is not used
25
+ if not self.use_api:
26
+ self.load_model()
27
+
28
+ def load_model(self):
29
+ """载入模型"""
30
+ self.tokenizer = AutoTokenizer.from_pretrained(self.bert_path)
31
+ self.model = AutoModel.from_pretrained(self.bert_path)
32
+
33
+ def mean_pooling(self, model_output, attention_mask):
34
+ """采用序列mean-pooling获得句子的表征向量"""
35
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
36
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
37
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
38
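mean_pooling averages only the non-padding token embeddings, using the attention mask as weights. A small self-contained sketch of the same computation (illustrative tensors, not part of the commit):

import torch

token_embeddings = torch.randn(1, 4, 8)          # (batch, tokens, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0]])    # last position is padding
mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sentence_vec = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
# sentence_vec has shape (1, 8): the mean over the three real tokens only.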
+
39
+ def get_vec(self, sentences):
40
+ """通过模型获取句子的向量"""
41
+ if self.use_api:
42
+ # 如果使用API,重定向到API方法
43
+ return self.get_vec_api(sentences)
44
+
45
+ # otherwise use the original BERT-based method
46
+ encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
47
+ with torch.no_grad():
48
+ model_output = self.model(**encoded_input)
49
+ sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
50
+ sentence_embeddings = sentence_embeddings.data.cpu().numpy().tolist()
51
+ return sentence_embeddings
52
+
53
+ def get_vec_api(self, query, batch_size=None):
54
+ """通过API获取句子的向量"""
55
+ if batch_size is None:
56
+ batch_size = self.batch_size
57
+
58
+ # guard against an empty query
59
+ if not query:
60
+ print("Warning: Empty query provided to get_vec_api")
61
+ return []
62
+
63
+ client = OpenAI(
64
+ api_key=self.api_key,
65
+ base_url=self.base_url
66
+ )
67
+
68
+ if isinstance(query, str):
69
+ query = [query]
70
+
71
+ # drop empty strings and None values so the input is valid
72
+ query = [q for q in query if q and isinstance(q, str) and q.strip()]
73
+ if not query:
74
+ print("Warning: No valid text to vectorize after filtering")
75
+ return []
76
+
77
+ all_vectors = []
78
+ retry_count = 0
79
+ max_retries = 2 # allow a couple of retries
80
+
81
+ while retry_count <= max_retries and not all_vectors:
82
+ try:
83
+ for i in range(0, len(query), batch_size):
84
+ batch = query[i:i + batch_size]
85
+ try:
86
+ completion = client.embeddings.create(
87
+ model=self.model_name,
88
+ input=batch,
89
+ dimensions=self.dimensions,
90
+ encoding_format="float"
91
+ )
92
+ vectors = [embedding.embedding for embedding in completion.data]
93
+ all_vectors.extend(vectors)
94
+ except Exception as e:
95
+ print(f"向量化批次 {i//batch_size + 1} 失败:{str(e)}")
96
+ # 不立即返回空数组,继续处理其他批次
97
+ continue
98
+
99
+ # check whether any vectors were successfully obtained
100
+ if all_vectors:
101
+ break
102
+ else:
103
+ retry_count += 1
104
+ print(f"未获取到任何向量,第 {retry_count} 次重试...")
105
+
106
+ except Exception as outer_e:
107
+ print(f"向量化过程中发生错误:{str(outer_e)}")
108
+ retry_count += 1
109
+ if retry_count <= max_retries:
110
+ print(f"第 {retry_count} 次重试...")
111
+
112
+ # return the vectors; if still empty, make sure an empty array with the correct shape is returned
113
+ if not all_vectors and self.dimensions > 0:
114
+ print("Warning: returning an empty vector array of shape [0, dimensions]")
115
+ return np.zeros((0, self.dimensions))
116
+
117
+ return all_vectors
118
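When the DashScope-compatible endpoint and key in config.py are valid, get_vec_api returns one dimensions-long float list per surviving input string (empty inputs are filtered out first). A hedged sketch using the module-level tv instance defined at the bottom of this file:

# Illustrative; requires a working API key.
vecs = tv.get_vec_api(['今天天气不错', '', '明天可能下雨'])
# The empty string is dropped, so vecs should hold two 1024-dimensional lists
# (cfg.dimensions); on total failure an empty (0, 1024) array is returned instead.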
+
119
+ def get_vec_batch(self, data, bs=None):
120
+ """batch方式获取,提高效率"""
121
+ if bs is None:
122
+ bs = self.batch_size
123
+
124
+ if self.use_api:
125
+ # if the API is enabled, call the API method directly
126
+ vectors = self.get_vec_api(data, bs)
127
+ return torch.tensor(np.array(vectors)) if len(vectors) > 0 else torch.tensor(np.array([]))
128
+
129
+ # otherwise use the original BERT-based method
130
+ data = seq(data).grouped(bs)
131
+ all_vectors = []
132
+ for batch in data:
133
+ vecs = self.get_vec(batch)
134
+ all_vectors.extend(vecs)
135
+ all_vectors = torch.tensor(np.array(all_vectors))
136
+ return all_vectors
137
+
138
+ def vector_similarity(self, vectors):
139
+ """以[query,text1,text2...]来计算query与text1,text2,...的cosine相似度"""
140
+ # Add dimension checking to prevent errors
141
+ if vectors.size(0) <= 1:
142
+ print("Warning: Not enough vectors for similarity calculation")
143
+ return []
144
+
145
+ if len(vectors.shape) < 2:
146
+ print("Warning: Vectors must be 2-dimensional")
147
+ return []
148
+
149
+ vectors = F.normalize(vectors, p=2, dim=1)
150
+ q_vec = vectors[0,:]
151
+ o_vec = vectors[1:,:]
152
+ sim = cosine_similarity(q_vec, o_vec)
153
+ sim = sim.data.cpu().numpy().tolist()
154
+ return sim
155
+
156
+ # fixed function-name typo: get_vec_bath -> get_vec_batch
157
+ cfg = Config()
158
+ tv = TextVector(cfg)
159
+ get_vector = tv.get_vec_batch # corrected name
160
+ get_sim = tv.vector_similarity
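get_vector and get_sim are the two module-level helpers used by retrievor.py. A minimal sketch of ranking two candidate texts against a query, assuming the embedding API call succeeds (illustrative only):

texts = ['杭州今天天气怎么样', '杭州今日多云,最高气温二十六度', '上海明天有小雨']
vectors = get_vector(texts)        # tensor of shape (3, cfg.dimensions) on success
sims = get_sim(vectors)            # [sim(query, text1), sim(query, text2)]
if sims:
    best = texts[1:][max(range(len(sims)), key=lambda i: sims[i])]
    print(best, sims)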