Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ import re
 from groq import Groq
 import nltk
 import logging
-import sys
+import sys

 # Set up basic logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -88,6 +88,9 @@ def clean_word_for_display(word):

 # Helper to check if a token is purely punctuation
 def is_punctuation(token):
+    """
+    Checks if a token consists solely of punctuation characters.
+    """
     # This regex matches any string composed solely of non-alphanumeric characters and no whitespace
     return bool(re.fullmatch(r'[^\w\s]+', token))

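Note on the helper gaining a docstring here: the regex [^\w\s]+ full-matches only strings made entirely of characters that are neither word characters nor whitespace. A standalone sanity check (plain Python, not part of app.py):

import re

def is_punctuation(token):
    return bool(re.fullmatch(r'[^\w\s]+', token))

print(is_punctuation("!!!"))    # True: every character is punctuation
print(is_punctuation("don't"))  # False: contains word characters
print(is_punctuation("_"))      # False: underscore counts as \w, a word character
print(is_punctuation(""))       # False: fullmatch with + needs at least one character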
@@ -161,18 +164,25 @@ def teach_english_story(pdf_file):

     if not raw_story_text.strip():
         logging.warning("Extracted PDF text is empty or only whitespace.")
-        output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text.")
+        output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text. Please ensure your PDF contains readable text, not just images.")
         return "\n".join(output_lessons)

     # --- New Logging for Debugging ---
     logging.info(f"Successfully extracted text. Length: {len(raw_story_text)} characters.")
+    # Show only a snippet to avoid overwhelming logs with very large texts
     logging.info(f"Extracted text snippet (first 500 chars): {raw_story_text[:500]}...")
     # --- End New Logging ---

     # Use NLTK for robust sentence tokenization
     logging.info("Attempting to tokenize sentences using NLTK.")
-    sentences = nltk.sent_tokenize(raw_story_text)
-    logging.info(f"Story split into {len(sentences)} sentences.")
+    try:
+        sentences = nltk.sent_tokenize(raw_story_text)
+        logging.info(f"Story split into {len(sentences)} sentences.")
+    except Exception as e:
+        logging.error(f"Error during NLTK sentence tokenization: {e}")
+        output_lessons.append(f"Processing Error: Could not split story into sentences. This might be due to unusual text formatting in the PDF. Error: {e}")
+        return "\n".join(output_lessons)
+

     for i, sentence in enumerate(sentences):
         output_lessons.append(f"\n--- Sentence {i+1} ---")
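One thing the new try/except will also catch: nltk.sent_tokenize (and nltk.word_tokenize further down) need NLTK's punkt tokenizer models and raise a LookupError when those are missing. If the Space doesn't already download them at startup, a common guard looks like this (a sketch; newer NLTK releases may need 'punkt_tab' instead of 'punkt'):

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')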
@@ -180,6 +190,7 @@ def teach_english_story(pdf_file):

         # Use NLTK to tokenize words and punctuation separately
         word_tokens = nltk.word_tokenize(sentence)
+        logging.debug(f"Tokens for sentence {i+1}: {word_tokens}")  # Detailed logging for tokens

         # Word-by-word translation
         output_lessons.append("\nWord-by-Word Translation:")
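Caveat on the logging.debug call added in this hunk: app.py configures the root logger at INFO level, so DEBUG records are filtered out and this line prints nothing as-is. To actually see the per-sentence token dumps while debugging, raise the verbosity (a sketch of the one-line change at the top of app.py):

import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')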