Spaces:
Running
Running
| {% extends "base.html" %} | |
| {% block title %}Tokenization - NLP Ultimate Tutorial{% endblock %} | |
| {% block content %} | |
| <div class="container"> | |
<!-- Header Section: page title, lead sentence, and a one-line definition of tokenization -->
<div class="row mb-4">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h1 class="mb-0">
                    <!-- Icon is purely decorative, so it is hidden from assistive technology -->
                    <i class="fas fa-cut" aria-hidden="true"></i>
                    Tokenization
                </h1>
            </div>
            <div class="card-body">
                <p class="lead">Break text into smaller units called tokens using various tokenization methods.</p>
                <div class="alert alert-info">
                    <i class="fas fa-info-circle" aria-hidden="true"></i>
                    <strong>About:</strong> Tokenization is the process of breaking text into smaller units called tokens, which can be words, characters, or subwords.
                </div>
            </div>
        </div>
    </div>
</div>
| {% include "_analysis_nav.html" %} | |
<!-- Text Input Section: free-text entry, sample picker, and the Analyze / Clear actions.
     The element ids (textInput, sampleSelect, processBtn, clearBtn) are looked up by the
     page script, so they must stay unchanged. -->
<div class="row mb-4">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h3 class="mb-0">
                    <i class="fas fa-keyboard" aria-hidden="true"></i>
                    <!-- The heading text is the textarea's label; a placeholder alone is not
                         an accessible name and disappears once the user types -->
                    <label for="textInput">Enter your text:</label>
                </h3>
            </div>
            <div class="card-body">
                <div class="row mb-3">
                    <div class="col-md-8">
                        <textarea id="textInput" class="form-control" rows="6" placeholder="Enter or paste your text here...">The quick brown fox jumps over the lazy dog. It was a beautiful day in May of 2023!</textarea>
                    </div>
                    <div class="col-md-4">
                        <label for="sampleSelect" class="form-label">Or choose a sample:</label>
                        <select id="sampleSelect" class="form-select">
                            <option value="Custom">Custom</option>
                            <option value="News Article">News Article</option>
                            <option value="Product Review">Product Review</option>
                            <option value="Scientific Text">Scientific Text</option>
                            <option value="Literary Text">Literary Text</option>
                        </select>
                    </div>
                </div>
                <div class="d-flex justify-content-between align-items-center">
                    <div>
                        <!-- type="button": a bare <button> defaults to submit, which would
                             post any form this block is later wrapped in -->
                        <button id="processBtn" type="button" class="btn btn-primary btn-lg">
                            <i class="fas fa-cut" aria-hidden="true"></i>
                            Analyze Tokens
                        </button>
                    </div>
                    <div>
                        <button id="clearBtn" type="button" class="btn btn-outline-secondary">
                            <i class="fas fa-trash" aria-hidden="true"></i>
                            Clear
                        </button>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
<!-- Tokenization Methods Info: four static cards, one per tokenization method described on this page -->
<div class="row mb-4">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h3 class="mb-0">
                    <!-- All icons in this section are decorative and hidden from assistive technology -->
                    <i class="fas fa-info-circle" aria-hidden="true"></i>
                    Tokenization Methods
                </h3>
            </div>
            <div class="card-body">
                <div class="row">
                    <div class="col-md-3">
                        <div class="card h-100">
                            <div class="card-body text-center">
                                <i class="fas fa-font fa-2x text-primary mb-2" aria-hidden="true"></i>
                                <h5>Word Tokenization</h5>
                                <p class="small">Splits text into individual words and punctuation marks using NLTK.</p>
                            </div>
                        </div>
                    </div>
                    <div class="col-md-3">
                        <div class="card h-100">
                            <div class="card-body text-center">
                                <i class="fas fa-paragraph fa-2x text-success mb-2" aria-hidden="true"></i>
                                <h5>Sentence Tokenization</h5>
                                <p class="small">Divides text into sentences using punctuation and linguistic rules.</p>
                            </div>
                        </div>
                    </div>
                    <div class="col-md-3">
                        <div class="card h-100">
                            <div class="card-body text-center">
                                <i class="fas fa-brain fa-2x text-info mb-2" aria-hidden="true"></i>
                                <h5>Linguistic Tokenization</h5>
                                <p class="small">Advanced tokenization with spaCy including POS tags and dependencies.</p>
                            </div>
                        </div>
                    </div>
                    <div class="col-md-3">
                        <div class="card h-100">
                            <div class="card-body text-center">
                                <i class="fas fa-puzzle-piece fa-2x text-warning mb-2" aria-hidden="true"></i>
                                <h5>Subword Tokenization</h5>
                                <p class="small">Breaks words into smaller units using BERT WordPiece and GPT-2 BPE.</p>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
<!-- Results Section: #resultsContainer is overwritten by the page script with a loading
     spinner, the tokenization results, or an error message -->
<div class="row">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h3 class="mb-0">
                    <i class="fas fa-chart-bar" aria-hidden="true"></i>
                    Tokenization Results
                </h3>
            </div>
            <div class="card-body">
                <!-- aria-live: content arrives asynchronously after the button click, so the
                     region is declared up front to let screen readers announce the update -->
                <div id="resultsContainer" aria-live="polite">
                    <div class="text-center text-muted py-5">
                        <i class="fas fa-arrow-up fa-2x mb-3" aria-hidden="true"></i>
                        <p>Click "Analyze Tokens" to see tokenization results</p>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
| </div> | |
| {% endblock %} | |
| {% block extra_scripts %} | |
| <script> | |
// Initialize page: once the DOM is ready, restore any text carried over from another
// analysis page and wire up the sample picker, the Analyze / Clear buttons and the
// keyboard shortcuts.
document.addEventListener('DOMContentLoaded', function() {
    // Look every control up once; each wiring step below is skipped if its element is
    // missing, so one absent control cannot abort the rest of the setup.
    const textInput = document.getElementById('textInput');
    const sampleSelect = document.getElementById('sampleSelect');
    const processBtn = document.getElementById('processBtn');
    const clearBtn = document.getElementById('clearBtn');
    const resultsContainer = document.getElementById('resultsContainer');

    // Only carry over when using Quick Nav; otherwise leave defaults
    const shouldCarry = sessionStorage.getItem('carryTextOnNextPage') === '1';
    if (shouldCarry) {
        if (sampleSelect) sampleSelect.value = 'Custom';
        const storedText = sessionStorage.getItem('analysisText');
        if (storedText && textInput) textInput.value = storedText;
        // The flag is one-shot: consume it so a later plain reload keeps the defaults.
        sessionStorage.removeItem('carryTextOnNextPage');
    }

    // Sample text dropdown handler
    if (sampleSelect && textInput) {
        sampleSelect.addEventListener('change', function() {
            const sampleType = this.value;
            if (sampleType === 'Custom') {
                textInput.value = '';
                return;
            }
            // Get sample text from server
            fetch('/api/sample-text', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({sample_type: sampleType})
            })
            .then(response => {
                if (!response.ok) {
                    throw new Error('HTTP ' + response.status);
                }
                return response.json();
            })
            .then(data => {
                // Drop a stale response: the user picked something else while this
                // request was still in flight.
                if (sampleSelect.value !== sampleType) return;
                // Never write "undefined" into the textarea if the payload lacks text.
                if (data && typeof data.text === 'string') {
                    textInput.value = data.text;
                }
            })
            .catch(error => {
                // Leave the current text untouched; the failure is only logged.
                console.error('Failed to load sample text:', error);
            });
        });
    }

    // Process button handler
    if (processBtn && textInput) {
        processBtn.addEventListener('click', function() {
            const text = textInput.value.trim();
            if (!text) {
                alert('Please enter some text to tokenize.');
                return;
            }
            // Show loading state
            this.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Processing...';
            this.disabled = true;

            const restoreButton = () => {
                this.innerHTML = '<i class="fas fa-cut"></i> Analyze Tokens';
                this.disabled = false;
            };

            // Process text
            let pending;
            try {
                pending = processTokenization();
            } catch (err) {
                // Do not leave the button stuck in the disabled state.
                restoreButton();
                throw err;
            }
            if (pending && typeof pending.then === 'function') {
                // Re-enable exactly when the request settles, however long it takes.
                pending.then(restoreButton, restoreButton);
            } else {
                // processTokenization gave no completion signal: fall back to a fixed delay.
                setTimeout(restoreButton, 2000);
            }
        });
    }

    // Clear button handler: empty the input and restore the results placeholder
    if (clearBtn) {
        clearBtn.addEventListener('click', function() {
            if (textInput) textInput.value = '';
            if (resultsContainer) {
                resultsContainer.innerHTML = `
                    <div class="text-center text-muted py-5">
                        <i class="fas fa-arrow-up fa-2x mb-3"></i>
                        <p>Click "Analyze Tokens" to see tokenization results</p>
                    </div>
                `;
            }
        });
    }

    // Keyboard shortcuts
    document.addEventListener('keydown', function(e) {
        // Ctrl+Enter to process (click() is a no-op while the button is disabled)
        if (e.ctrlKey && e.key === 'Enter') {
            if (processBtn) processBtn.click();
        }
        // Ctrl+L to clear
        if (e.ctrlKey && e.key === 'l') {
            // Ctrl+L is normally a browser shortcut, so suppress the default action
            e.preventDefault();
            if (clearBtn) clearBtn.click();
        }
    });
});
// Process tokenization: send the trimmed contents of #textInput to /api/tokenization
// and render the outcome in #resultsContainer.
//
// Returns a Promise that settles once the request has finished and the results or an
// error message have been rendered. It never rejects, because failures are reported
// through showError. Callers that ignore the return value keep working as before.
function processTokenization() {
    const text = document.getElementById('textInput').value.trim();
    if (!text) {
        alert('Please enter some text to tokenize.');
        // Already-settled promise, so the return type is the same on every path.
        return Promise.resolve();
    }
    showLoading('resultsContainer');
    return fetch('/api/tokenization', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify({text: text})
    })
    .then(response => response.json().catch(() => {
        // The body was not JSON (for example an HTML error page): report the HTTP
        // status instead of a cryptic JSON parse error.
        throw new Error('Unexpected response from server (HTTP ' + response.status + ')');
    }))
    .then(data => {
        // The success / result / error fields are read as the original code read them;
        // a null payload takes the error branch instead of throwing.
        if (data && data.success) {
            displayResults(data.result);
        } else {
            showError((data && data.error) || 'An error occurred while processing the text');
        }
    })
    .catch(error => {
        showError('Failed to process text: ' + error.message);
    })
    .finally(() => {
        // Only clears the container if the spinner is still showing.
        hideLoading('resultsContainer');
    });
}
// Show loading state: replace the contents of the element with the given id by a
// spinner and an "Analyzing tokens..." caption. Does nothing if no such element exists.
function showLoading(elementId) {
    const target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    const spinnerMarkup = `
<div class="text-center py-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2">Analyzing tokens...</p>
</div>
`;
    target.innerHTML = spinnerMarkup;
}
// Hide loading state: empty the element with the given id, but only while it still
// contains the spinner markup, so content rendered in the meantime is left alone.
function hideLoading(elementId) {
    const target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    const spinnerStillShown = target.innerHTML.indexOf('spinner-border') !== -1;
    if (spinnerStillShown) {
        target.innerHTML = '';
    }
}
// Show error message: render `message` as a danger alert inside the element with the
// given id (default 'resultsContainer'). Does nothing if no such element exists.
//
// `message` is treated as plain text. It can contain server-supplied or exception
// text, so HTML metacharacters are escaped before the string is placed into innerHTML;
// otherwise markup in the message would be parsed and could inject script.
function showError(message, elementId = 'resultsContainer') {
    const element = document.getElementById(elementId);
    if (!element) {
        return;
    }
    const htmlEscapes = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    const safeMessage = String(message).replace(/[&<>"']/g, ch => htmlEscapes[ch]);
    // role="alert" lets assistive technology announce the error when it is inserted.
    element.innerHTML = `
        <div class="alert alert-danger fade-in" role="alert">
            <i class="fas fa-exclamation-triangle" aria-hidden="true"></i>
            <strong>Error:</strong> ${safeMessage}
        </div>
    `;
}
// Display results: insert `result` into #resultsContainer as HTML (it is assigned to
// innerHTML verbatim), add the 'fade-in' class, and smooth-scroll the container into
// view. Does nothing if the container is not in the document.
// NOTE(review): `result` is presumably server-rendered markup - confirm it is trusted.
function displayResults(result) {
    const resultsBox = document.getElementById('resultsContainer');
    if (!resultsBox) {
        return;
    }
    resultsBox.innerHTML = result;
    resultsBox.classList.add('fade-in');
    // Scroll to results
    const scrollOptions = { behavior: 'smooth', block: 'start' };
    resultsBox.scrollIntoView(scrollOptions);
}
| </script> | |
| {% endblock %} | |