Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ import re
 from groq import Groq
 import nltk
 import logging
-import sys
+import sys

 # Set up basic logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -88,6 +88,9 @@ def clean_word_for_display(word):

 # Helper to check if a token is purely punctuation
 def is_punctuation(token):
+    """
+    Checks if a token consists solely of punctuation characters.
+    """
     # This regex matches any string composed solely of non-alphanumeric characters and no whitespace
     return bool(re.fullmatch(r'[^\w\s]+', token))

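Note on the helper gaining a docstring here: the regex [^\w\s]+ full-matches only strings made entirely of characters that are neither word characters nor whitespace. A standalone sanity check (plain Python, not part of app.py):

import re

def is_punctuation(token):
    return bool(re.fullmatch(r'[^\w\s]+', token))

print(is_punctuation("!!!"))    # True: every character is punctuation
print(is_punctuation("don't"))  # False: contains word characters
print(is_punctuation("_"))      # False: underscore counts as \w, a word character
print(is_punctuation(""))       # False: fullmatch with + needs at least one character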
@@ -161,18 +164,25 @@ def teach_english_story(pdf_file):

     if not raw_story_text.strip():
         logging.warning("Extracted PDF text is empty or only whitespace.")
-        output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text.")
+        output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text. Please ensure your PDF contains readable text, not just images.")
         return "\n".join(output_lessons)

     # --- New Logging for Debugging ---
     logging.info(f"Successfully extracted text. Length: {len(raw_story_text)} characters.")
+    # Show only a snippet to avoid overwhelming logs with very large texts
     logging.info(f"Extracted text snippet (first 500 chars): {raw_story_text[:500]}...")
     # --- End New Logging ---

     # Use NLTK for robust sentence tokenization
     logging.info("Attempting to tokenize sentences using NLTK.")
-    sentences = nltk.sent_tokenize(raw_story_text)
-    logging.info(f"Story split into {len(sentences)} sentences.")
+    try:
+        sentences = nltk.sent_tokenize(raw_story_text)
+        logging.info(f"Story split into {len(sentences)} sentences.")
+    except Exception as e:
+        logging.error(f"Error during NLTK sentence tokenization: {e}")
+        output_lessons.append(f"Processing Error: Could not split story into sentences. This might be due to unusual text formatting in the PDF. Error: {e}")
+        return "\n".join(output_lessons)
+

     for i, sentence in enumerate(sentences):
         output_lessons.append(f"\n--- Sentence {i+1} ---")
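One thing the new try/except will also catch: nltk.sent_tokenize (and nltk.word_tokenize further down) need NLTK's punkt tokenizer models and raise a LookupError when those are missing. If the Space doesn't already download them at startup, a common guard looks like this (a sketch; newer NLTK releases may need 'punkt_tab' instead of 'punkt'):

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')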
@@ -180,6 +190,7 @@ def teach_english_story(pdf_file):

         # Use NLTK to tokenize words and punctuation separately
         word_tokens = nltk.word_tokenize(sentence)
+        logging.debug(f"Tokens for sentence {i+1}: {word_tokens}")  # Detailed logging for tokens

         # Word-by-word translation
         output_lessons.append("\nWord-by-Word Translation:")
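Caveat on the logging.debug call added in this hunk: app.py configures the root logger at INFO level, so DEBUG records are filtered out and this line prints nothing as-is. To actually see the per-sentence token dumps while debugging, raise the verbosity (a sketch of the one-line change at the top of app.py):

import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')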