Spaces:

Siyuan0730
/

OmniTutor

Sleeping

Siyuan0730 committed on Oct 31, 2023

Commit

aa649cc

1 Parent(s): 7b507e8

加油

Files changed (1) hide show

app.py CHANGED Viewed

@@ -27,13 +27,10 @@ def chunkstring(string, length):
 def pdf_parser(input_pdf):
     pdf = PdfReader(input_pdf)
-    content = ""
     for page in pdf.pages:
-        content += page.extract_text()
-    output_file = io.StringIO()
-    output_file.write(content)
-    output_file.seek(0)
-    return output_file.getvalue().encode('utf-8')
 def get_keywords(file_paths): #这里的重点是，对每一个file做尽可能简短且覆盖全面的summarization
     download_nltk()
@@ -221,11 +218,15 @@ def initialize_file(added_files):
     with st.spinner('Processing file...'):
         for added_file in added_files:
             if added_file.name.endswith(".pdf"):
-                added_file = pdf_parser(added_file)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
-                tmp.write(added_file.getvalue())
-                tmp_path = tmp.name
-                temp_file_paths.append(tmp_path)
     st.success('Processing file...Done')
     return temp_file_paths

 def pdf_parser(input_pdf):
     pdf = PdfReader(input_pdf)
+    pdf_content = ""
     for page in pdf.pages:
+        pdf_content += page.extract_text()
+    return pdf_content
 def get_keywords(file_paths): #这里的重点是，对每一个file做尽可能简短且覆盖全面的summarization
     download_nltk()
     with st.spinner('Processing file...'):
         for added_file in added_files:
             if added_file.name.endswith(".pdf"):
+                string = pdf_parser(added_file)
+                with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
+                    tmp.write(string.encode("utf-8"))
+                    tmp_path = tmp.name
+            else:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
+                    tmp.write(added_file.getvalue())
+                    tmp_path = tmp.name
+            temp_file_paths.append(tmp_path)
     st.success('Processing file...Done')
     return temp_file_paths