Spaces:
Sleeping
Sleeping
Commit
·
aa649cc
1
Parent(s):
7b507e8
加油
Browse files
app.py
CHANGED
|
@@ -27,13 +27,10 @@ def chunkstring(string, length):
|
|
| 27 |
|
| 28 |
def pdf_parser(input_pdf):
|
| 29 |
pdf = PdfReader(input_pdf)
|
| 30 |
-
|
| 31 |
for page in pdf.pages:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
output_file.write(content)
|
| 35 |
-
output_file.seek(0)
|
| 36 |
-
return output_file.getvalue().encode('utf-8')
|
| 37 |
|
| 38 |
def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
|
| 39 |
download_nltk()
|
|
@@ -221,11 +218,15 @@ def initialize_file(added_files):
|
|
| 221 |
with st.spinner('Processing file...'):
|
| 222 |
for added_file in added_files:
|
| 223 |
if added_file.name.endswith(".pdf"):
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
st.success('Processing file...Done')
|
| 230 |
return temp_file_paths
|
| 231 |
|
|
|
|
| 27 |
|
| 28 |
def pdf_parser(input_pdf):
    """Return the text of every page in *input_pdf* as one string.

    Args:
        input_pdf: Whatever ``PdfReader`` accepts as its source; the caller
            in this file passes the uploaded file object directly.

    Returns:
        The text extracted from each page, concatenated in page order with
        no separator inserted between pages.
    """
    pdf = PdfReader(input_pdf)
    # Join once instead of growing a string with ``+=`` on every page.
    # ``or ""`` is a defensive guard: if a page yields no text (presumably
    # None or "" depending on the reader version -- TODO confirm), it
    # contributes nothing rather than raising a TypeError.
    return "".join(page.extract_text() or "" for page in pdf.pages)
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def get_keywords(file_paths): #这里的重点是,对每一个file做尽可能简短且覆盖全面的summarization
|
| 36 |
download_nltk()
|
|
|
|
| 218 |
with st.spinner('Processing file...'):
|
| 219 |
for added_file in added_files:
|
| 220 |
if added_file.name.endswith(".pdf"):
|
| 221 |
+
string = pdf_parser(added_file)
|
| 222 |
+
with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
|
| 223 |
+
tmp.write(string.encode("utf-8"))
|
| 224 |
+
tmp_path = tmp.name
|
| 225 |
+
else:
|
| 226 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".md") as tmp:
|
| 227 |
+
tmp.write(added_file.getvalue())
|
| 228 |
+
tmp_path = tmp.name
|
| 229 |
+
temp_file_paths.append(tmp_path)
|
| 230 |
st.success('Processing file...Done')
|
| 231 |
return temp_file_paths
|
| 232 |
|