ahmedumeraziz committed on
Commit
2fd83fa
·
verified ·
1 Parent(s): cf014df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -18
app.py CHANGED
@@ -14,27 +14,27 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
14
  # Define a writable directory for NLTK data in Hugging Face Spaces
15
  NLTK_DATA_DIR = "/tmp/nltk_data"
16
 
17
- # Add this directory to NLTK's data path
 
 
 
 
18
  if NLTK_DATA_DIR not in nltk.data.path:
19
- nltk.data.path.append(NLTK_DATA_DIR)
20
- logging.info(f"Added {NLTK_DATA_DIR} to NLTK data path.")
21
 
22
- # Ensure 'punkt' tokenizer is downloaded.
23
- # This must happen before any `nltk.sent_tokenize` or `nltk.word_tokenize` calls.
24
  try:
25
- # Try to find 'punkt' in the specified download directory first
26
- nltk.data.find('tokenizers/punkt', paths=[NLTK_DATA_DIR])
27
- logging.info("NLTK 'punkt' tokenizer already downloaded to /tmp/nltk_data.")
28
- except LookupError:
29
- logging.info("NLTK 'punkt' tokenizer not found, attempting to download to /tmp/nltk_data...")
30
- try:
31
- # Download 'punkt' explicitly to the writable directory
32
- nltk.download('punkt', download_dir=NLTK_DATA_DIR)
33
- logging.info("NLTK 'punkt' tokenizer downloaded successfully to /tmp/nltk_data.")
34
- except Exception as e:
35
- logging.error(f"Failed to download NLTK 'punkt' tokenizer: {e}")
36
- # Re-raise the error as it's a critical dependency for the application to function.
37
- raise
38
 
39
  # --- Groq API Configuration ---
40
  # IMPORTANT: It's highly recommended to set your GROQ_API_KEY as an environment variable
 
14
  # Define a writable directory for NLTK data in Hugging Face Spaces
15
  NLTK_DATA_DIR = "/tmp/nltk_data"
16
 
17
+ # Ensure the NLTK data directory exists
18
+ os.makedirs(NLTK_DATA_DIR, exist_ok=True)
19
+ logging.info(f"Ensured NLTK data directory exists: {NLTK_DATA_DIR}")
20
+
21
+ # Add this directory to NLTK's data path and prioritize it
22
  if NLTK_DATA_DIR not in nltk.data.path:
23
+ nltk.data.path.insert(0, NLTK_DATA_DIR) # Use insert(0) to prioritize this path
24
+ logging.info(f"Prioritized {NLTK_DATA_DIR} in NLTK data path.")
25
 
26
+ # Attempt to download 'punkt' if it's not fully accessible or missing sub-components like 'punkt_tab'.
27
+ # This step is crucial and will run every time the app starts to ensure the resource is available.
28
  try:
29
+ logging.info("Attempting to verify/download NLTK 'punkt' tokenizer to ensure all components are available...")
30
+ # This will download 'punkt' if not found, or verify if already there.
31
+ # quiet=False provides more verbose output during download in logs.
32
+ nltk.download('punkt', download_dir=NLTK_DATA_DIR, quiet=False)
33
+ logging.info("NLTK 'punkt' tokenizer confirmed (or downloaded) successfully.")
34
+ except Exception as e:
35
+ logging.error(f"Critical error: Failed to download NLTK 'punkt' tokenizer (this might affect 'punkt_tab'): {e}")
36
+ # Re-raise the error as the application cannot proceed without this resource.
37
+ raise
 
 
 
 
38
 
39
  # --- Groq API Configuration ---
40
  # IMPORTANT: It's highly recommended to set your GROQ_API_KEY as an environment variable