ahmedumeraziz committed on
Commit
cf014df
·
verified ·
1 Parent(s): 64561bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -4
app.py CHANGED
@@ -5,7 +5,7 @@ import re
5
  from groq import Groq
6
  import nltk
7
  import logging
8
- import sys # Import sys for path manipulation if needed, though /tmp approach is often cleaner
9
 
10
  # Set up basic logging
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -88,6 +88,9 @@ def clean_word_for_display(word):
88
 
89
  # Helper to check if a token is purely punctuation
90
  def is_punctuation(token):
 
 
 
91
  # This regex matches any string composed solely of non-alphanumeric characters and no whitespace
92
  return bool(re.fullmatch(r'[^\w\s]+', token))
93
 
@@ -161,18 +164,25 @@ def teach_english_story(pdf_file):
161
 
162
  if not raw_story_text.strip():
163
  logging.warning("Extracted PDF text is empty or only whitespace.")
164
- output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text.")
165
  return "\n".join(output_lessons)
166
 
167
  # --- New Logging for Debugging ---
168
  logging.info(f"Successfully extracted text. Length: {len(raw_story_text)} characters.")
 
169
  logging.info(f"Extracted text snippet (first 500 chars): {raw_story_text[:500]}...")
170
  # --- End New Logging ---
171
 
172
  # Use NLTK for robust sentence tokenization
173
  logging.info("Attempting to tokenize sentences using NLTK.")
174
- sentences = nltk.sent_tokenize(raw_story_text)
175
- logging.info(f"Story split into {len(sentences)} sentences.")
 
 
 
 
 
 
176
 
177
  for i, sentence in enumerate(sentences):
178
  output_lessons.append(f"\n--- Sentence {i+1} ---")
@@ -180,6 +190,7 @@ def teach_english_story(pdf_file):
180
 
181
  # Use NLTK to tokenize words and punctuation separately
182
  word_tokens = nltk.word_tokenize(sentence)
 
183
 
184
  # Word-by-word translation
185
  output_lessons.append("\nWord-by-Word Translation:")
 
5
  from groq import Groq
6
  import nltk
7
  import logging
8
+ import sys
9
 
10
  # Set up basic logging
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
88
 
89
  # Helper to check if a token is purely punctuation
90
  def is_punctuation(token):
91
+ """
92
+ Checks if a token consists solely of punctuation characters.
93
+ """
94
  # This regex matches any string composed solely of non-alphanumeric characters and no whitespace
95
  return bool(re.fullmatch(r'[^\w\s]+', token))
96
 
 
164
 
165
  if not raw_story_text.strip():
166
  logging.warning("Extracted PDF text is empty or only whitespace.")
167
+ output_lessons.append("The uploaded PDF appears to be empty or contains no extractable text. Please ensure your PDF contains readable text, not just images.")
168
  return "\n".join(output_lessons)
169
 
170
  # --- New Logging for Debugging ---
171
  logging.info(f"Successfully extracted text. Length: {len(raw_story_text)} characters.")
172
+ # Show only a snippet to avoid overwhelming logs with very large texts
173
  logging.info(f"Extracted text snippet (first 500 chars): {raw_story_text[:500]}...")
174
  # --- End New Logging ---
175
 
176
  # Use NLTK for robust sentence tokenization
177
  logging.info("Attempting to tokenize sentences using NLTK.")
178
+ try:
179
+ sentences = nltk.sent_tokenize(raw_story_text)
180
+ logging.info(f"Story split into {len(sentences)} sentences.")
181
+ except Exception as e:
182
+ logging.error(f"Error during NLTK sentence tokenization: {e}")
183
+ output_lessons.append(f"Processing Error: Could not split story into sentences. This might be due to unusual text formatting in the PDF. Error: {e}")
184
+ return "\n".join(output_lessons)
185
+
186
 
187
  for i, sentence in enumerate(sentences):
188
  output_lessons.append(f"\n--- Sentence {i+1} ---")
 
190
 
191
  # Use NLTK to tokenize words and punctuation separately
192
  word_tokens = nltk.word_tokenize(sentence)
193
+ logging.debug(f"Tokens for sentence {i+1}: {word_tokens}") # Detailed logging for tokens
194
 
195
  # Word-by-word translation
196
  output_lessons.append("\nWord-by-Word Translation:")