{# nlp-ultimate-tutor / templates / tokenization.html #}
{% extends "base.html" %}
{% block title %}Tokenization - NLP Ultimate Tutorial{% endblock %}
{% block content %}
<div class="container">
<!-- Header Section -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h1 class="mb-0">
<i class="fas fa-cut"></i>
Tokenization
</h1>
</div>
<div class="card-body">
<p class="lead">Break text into smaller units called tokens using various tokenization methods.</p>
<div class="alert alert-info">
<i class="fas fa-info-circle"></i>
<strong>About:</strong> Tokenization is the process of breaking text into smaller units called tokens, which can be words, characters, or subwords.
</div>
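<!--
Illustrative only, assuming NLTK's default word tokenizer and the
bert-base-uncased WordPiece vocabulary:
word level:    "Don't stop!"   becomes  ["Do", "n't", "stop", "!"]
subword level: "tokenization"  becomes  ["token", "##ization"]
-->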
</div>
</div>
</div>
</div>
{% include "_analysis_nav.html" %}
<!-- Text Input Section -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h3 class="mb-0">
<i class="fas fa-keyboard"></i>
Enter your text:
</h3>
</div>
<div class="card-body">
<div class="row mb-3">
<div class="col-md-8">
<textarea id="textInput" class="form-control" rows="6" placeholder="Enter or paste your text here...">The quick brown fox jumps over the lazy dog. It was a beautiful day in May of 2023!</textarea>
</div>
<div class="col-md-4">
<label for="sampleSelect" class="form-label">Or choose a sample:</label>
<select id="sampleSelect" class="form-select">
<option value="Custom">Custom</option>
<option value="News Article">News Article</option>
<option value="Product Review">Product Review</option>
<option value="Scientific Text">Scientific Text</option>
<option value="Literary Text">Literary Text</option>
</select>
</div>
</div>
<div class="d-flex justify-content-between align-items-center">
<div>
<button id="processBtn" class="btn btn-primary btn-lg">
<i class="fas fa-cut"></i>
Analyze Tokens
</button>
</div>
<div>
<button id="clearBtn" class="btn btn-outline-secondary">
<i class="fas fa-trash"></i>
Clear
</button>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Tokenization Methods Info -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h3 class="mb-0">
<i class="fas fa-info-circle"></i>
Tokenization Methods
</h3>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-3">
<div class="card h-100">
<div class="card-body text-center">
<i class="fas fa-font fa-2x text-primary mb-2"></i>
<h5>Word Tokenization</h5>
<p class="small">Splits text into individual words and punctuation marks using NLTK.</p>
</div>
</div>
</div>
<div class="col-md-3">
<div class="card h-100">
<div class="card-body text-center">
<i class="fas fa-paragraph fa-2x text-success mb-2"></i>
<h5>Sentence Tokenization</h5>
<p class="small">Divides text into sentences using punctuation and linguistic rules.</p>
</div>
</div>
</div>
<div class="col-md-3">
<div class="card h-100">
<div class="card-body text-center">
<i class="fas fa-brain fa-2x text-info mb-2"></i>
<h5>Linguistic Tokenization</h5>
<p class="small">Advanced tokenization with spaCy including POS tags and dependencies.</p>
</div>
</div>
</div>
<div class="col-md-3">
<div class="card h-100">
<div class="card-body text-center">
<i class="fas fa-puzzle-piece fa-2x text-warning mb-2"></i>
<h5>Subword Tokenization</h5>
<p class="small">Breaks words into smaller units using BERT WordPiece and GPT-2 BPE.</p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
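<!--
A minimal Python sketch of the library calls the four cards above describe,
assuming the standard NLTK / spaCy / Hugging Face transformers APIs. The
actual /api/tokenization backend is not part of this template, so treat
this as illustrative only:

import nltk   # word/sentence tokenizers need the 'punkt' data: nltk.download('punkt')
import spacy
from transformers import AutoTokenizer

text = "The quick brown fox jumps over the lazy dog."
words = nltk.word_tokenize(text)                  # word tokens + punctuation
sents = nltk.sent_tokenize(text)                  # sentence tokens
doc = spacy.load("en_core_web_sm")(text)          # linguistic tokenization
tagged = [(t.text, t.pos_, t.dep_) for t in doc]  # POS tags and dependencies
wordpiece = AutoTokenizer.from_pretrained("bert-base-uncased").tokenize(text)  # BERT WordPiece
bpe = AutoTokenizer.from_pretrained("gpt2").tokenize(text)                     # GPT-2 BPE
-->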
<!-- Results Section -->
<div class="row">
<div class="col-12">
<div class="card">
<div class="card-header">
<h3 class="mb-0">
<i class="fas fa-chart-bar"></i>
Tokenization Results
</h3>
</div>
<div class="card-body">
<div id="resultsContainer">
<div class="text-center text-muted py-5">
<i class="fas fa-arrow-up fa-2x mb-3"></i>
<p>Click "Analyze Tokens" to see tokenization results</p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
{% endblock %}
{% block extra_scripts %}
<script>
// Initialize page
document.addEventListener('DOMContentLoaded', function() {
// Only carry over when using Quick Nav; otherwise leave defaults
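// ('carryTextOnNextPage' and 'analysisText' are presumably written to
// sessionStorage by the Quick Nav in _analysis_nav.html before navigating here)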
const shouldCarry = sessionStorage.getItem('carryTextOnNextPage') === '1';
if (shouldCarry) {
const sampleSel = document.getElementById('sampleSelect');
if (sampleSel) sampleSel.value = 'Custom';
const storedText = sessionStorage.getItem('analysisText');
if (storedText) document.getElementById('textInput').value = storedText;
sessionStorage.removeItem('carryTextOnNextPage');
}
// Sample text dropdown handler
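// POSTs {sample_type} to /api/sample-text and expects a JSON body of the
// form {text: "..."} (shape inferred from the handler below)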
document.getElementById('sampleSelect').addEventListener('change', function() {
const sampleType = this.value;
const textInput = document.getElementById('textInput');
if (sampleType === 'Custom') {
textInput.value = '';
} else {
// Get sample text from server
fetch('/api/sample-text', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({sample_type: sampleType})
})
.then(response => response.json())
.then(data => {
textInput.value = data.text;
})
.catch(error => {
showError('Failed to load sample text: ' + error.message);
});
}
});
// Process button handler
document.getElementById('processBtn').addEventListener('click', function() {
const text = document.getElementById('textInput').value.trim();
if (!text) {
alert('Please enter some text to tokenize.');
return;
}
// Show loading state while the request is in flight
this.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Processing...';
this.disabled = true;
// Process text, then reset the button when the request actually
// settles instead of after an arbitrary fixed delay
processTokenization().finally(() => {
this.innerHTML = '<i class="fas fa-cut"></i> Analyze Tokens';
this.disabled = false;
});
});
// Clear button handler
document.getElementById('clearBtn').addEventListener('click', function() {
document.getElementById('textInput').value = '';
document.getElementById('resultsContainer').innerHTML = `
<div class="text-center text-muted py-5">
<i class="fas fa-arrow-up fa-2x mb-3"></i>
<p>Click "Analyze Tokens" to see tokenization results</p>
</div>
`;
});
// Keyboard shortcuts
document.addEventListener('keydown', function(e) {
// Ctrl+Enter to process
if (e.ctrlKey && e.key === 'Enter') {
document.getElementById('processBtn').click();
}
// Ctrl+L to clear
if (e.ctrlKey && e.key === 'l') {
e.preventDefault();
document.getElementById('clearBtn').click();
}
});
});
// Process tokenization
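// POSTs {text} to /api/tokenization; the backend is expected to answer with
// {success: bool, result: <server-rendered HTML>, error?: string} (shape
// inferred from the handlers below). Returns the fetch promise so the
// caller can reset its UI when the request settles.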
function processTokenization() {
const text = document.getElementById('textInput').value.trim();
if (!text) {
alert('Please enter some text to tokenize.');
return Promise.resolve(); // keep the return type consistent for .finally()
}
showLoading('resultsContainer');
return fetch('/api/tokenization', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({text: text})
})
.then(response => response.json())
.then(data => {
if (data.success) {
displayResults(data.result);
} else {
showError(data.error || 'An error occurred while processing the text');
}
})
.catch(error => {
showError('Failed to process text: ' + error.message);
})
.finally(() => {
hideLoading('resultsContainer');
});
}
// Show loading state
function showLoading(elementId) {
const element = document.getElementById(elementId);
if (element) {
element.innerHTML = `
<div class="text-center py-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2">Analyzing tokens...</p>
</div>
`;
}
}
// Hide loading state
function hideLoading(elementId) {
const element = document.getElementById(elementId);
if (element && element.innerHTML.includes('spinner-border')) {
element.innerHTML = '';
}
}
// Show error message
function showError(message, elementId = 'resultsContainer') {
const element = document.getElementById(elementId);
if (element) {
element.innerHTML = `
<div class="alert alert-danger fade-in">
<i class="fas fa-exclamation-triangle"></i>
<strong>Error:</strong> ${message}
</div>
`;
}
}
// Display results
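// The server returns ready-to-render HTML, so it is injected via innerHTML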
function displayResults(result) {
const container = document.getElementById('resultsContainer');
if (container) {
container.innerHTML = result;
container.classList.add('fade-in');
// Scroll to results
container.scrollIntoView({ behavior: 'smooth', block: 'start' });
}
}
</script>
{% endblock %}