Siyuan0730 committed on
Commit
aa649cc
·
1 Parent(s): 7b507e8
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -27,13 +27,10 @@ def chunkstring(string, length):
27
 
28
  def pdf_parser(input_pdf):
29
  pdf = PdfReader(input_pdf)
30
- content = ""
31
  for page in pdf.pages:
32
- content += page.extract_text()
33
- output_file = io.StringIO()
34
- output_file.write(content)
35
- output_file.seek(0)
36
- return output_file.getvalue().encode('utf-8')
37
 
38
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
39
  download_nltk()
@@ -221,11 +218,15 @@ def initialize_file(added_files):
221
  with st.spinner('Processing file...'):
222
  for added_file in added_files:
223
  if added_file.name.endswith(".pdf"):
224
- added_file = pdf_parser(added_file)
225
- with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
226
- tmp.write(added_file.getvalue())
227
- tmp_path = tmp.name
228
- temp_file_paths.append(tmp_path)
 
 
 
 
229
  st.success('Processing file...Done')
230
  return temp_file_paths
231
 
 
27
 
28
  def pdf_parser(input_pdf):
29
  pdf = PdfReader(input_pdf)
30
+ pdf_content = ""
31
  for page in pdf.pages:
32
+ pdf_content += page.extract_text()
33
+ return pdf_content
 
 
 
34
 
35
  def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
36
  download_nltk()
 
218
  with st.spinner('Processing file...'):
219
  for added_file in added_files:
220
  if added_file.name.endswith(".pdf"):
221
+ string = pdf_parser(added_file)
222
+ with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
223
+ tmp.write(string.encode("utf-8"))
224
+ tmp_path = tmp.name
225
+ else:
226
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
227
+ tmp.write(added_file.getvalue())
228
+ tmp_path = tmp.name
229
+ temp_file_paths.append(tmp_path)
230
  st.success('Processing file...Done')
231
  return temp_file_paths
232