What is a Token?
Roughly 4 characters of text. “Hello, world!” ≈ 4 tokens
Understanding and optimizing token usage is crucial for efficient AI-assisted development. This guide covers advanced strategies for managing tokens, reducing costs, and maximizing the value of every AI interaction.
What is a Token?
Roughly 4 characters of text. “Hello, world!” ≈ 4 tokens
Context Window
Maximum tokens per request (8K to 1M+ depending on model)
Pricing Structure
Charged per 1K tokens for both input and output
Token Limits
Rate limits and monthly quotas vary by plan
// Token estimation utilitiesclass TokenCalculator { // Rough estimation: 1 token ≈ 4 characters private readonly CHARS_PER_TOKEN = 4;
estimateTokens(text: string): number { // More accurate estimation considering: // - Whitespace and punctuation // - Code syntax overhead // - Special characters
const baseTokens = text.length / this.CHARS_PER_TOKEN; const codeMultiplier = this.getCodeMultiplier(text);
return Math.ceil(baseTokens * codeMultiplier); }
private getCodeMultiplier(text: string): number { const indicators = { hasCode: /```[\s\S]*```/.test(text), hasJSON: /\{[\s\S]*\}/.test(text), hasSpecialChars: /[^\x00-\x7F]/.test(text), hasIndentation: /^\s{2,}/m.test(text) };
let multiplier = 1.0; if (indicators.hasCode) multiplier *= 1.2; if (indicators.hasJSON) multiplier *= 1.15; if (indicators.hasSpecialChars) multiplier *= 1.1; if (indicators.hasIndentation) multiplier *= 1.05;
return multiplier; }}
// Intelligent context layeringclass ContextLayerManager { private layers = { immediate: 2000, // Current file + direct deps relevant: 5000, // Related files in module extended: 10000, // Broader codebase context reference: 20000 // Documentation and examples };
async buildOptimalContext(task: Task): Promise<Context> { const context = new Context();
// Start with immediate context context.add(await this.getImmediateContext(task));
// Add layers based on task complexity if (task.complexity > 5) { context.add(await this.getRelevantContext(task)); }
if (task.requiresArchitecturalKnowledge) { context.add(await this.getExtendedContext(task)); }
// Always leave room for response const responseBuffer = this.estimateResponseSize(task); return this.pruneContext(context, responseBuffer); }
private pruneContext(context: Context, responseBuffer: number): Context { const maxTokens = this.getMaxTokensForModel() - responseBuffer;
if (context.tokenCount <= maxTokens) { return context; }
// Intelligent pruning return this.intelligentPrune(context, maxTokens); }}
// Remove unnecessary tokens from code contextclass CodeMinifier { minifyForContext(code: string): string { // Remove comments (except JSDoc) code = code.replace(/\/\/(?!\/)[^\n]*/g, ''); code = code.replace(/\/\*(?![\*!])[^*]*\*+(?:[^/*][^*]*\*+)*\//g, '');
// Remove extra whitespace code = code.replace(/\s+/g, ' '); code = code.replace(/\s*([{}();,])\s*/g, '$1');
// Remove console.logs in context code = code.replace(/console\.(log|warn|error)\([^)]*\);?/g, '');
return code.trim(); }
preserveStructure(code: string): string { // Keep structure but minimize tokens return code .split('\n') .map(line => { // Preserve indentation structure const indent = line.match(/^\s*/)[0]; const content = line.trim();
// Skip empty lines if (!content) return '';
// Minimize but keep readable return indent + content; }) .filter(Boolean) .join('\n'); }}
// Compress by extracting semantic meaningclass SemanticCompressor { async compressContext(files: File[]): Promise<CompressedContext> { const signatures = []; const relationships = [];
for (const file of files) { // Extract only signatures and types const ast = await this.parseFile(file);
signatures.push({ file: file.path, exports: this.extractExports(ast), imports: this.extractImports(ast), types: this.extractTypes(ast) });
relationships.push(this.extractRelationships(ast)); }
return { signatures, relationships, summary: this.generateSummary(signatures, relationships) }; }}
// Use references instead of full contentclass ReferenceCompressor { compressWithReferences(context: Context): CompressedContext { const seen = new Map<string, string>(); const compressed = [];
for (const item of context.items) { const hash = this.hashContent(item.content);
if (seen.has(hash)) { // Replace with reference compressed.push({ type: 'reference', ref: seen.get(hash), path: item.path }); } else { // First occurrence const id = this.generateId(); seen.set(hash, id);
compressed.push({ type: 'definition', id, content: item.content, path: item.path }); } }
return { compressed, savings: this.calculateSavings(context, compressed) }; }}
// Choose optimal model based on token requirementsclass ModelSelector { private models = { 'claude-4-haiku': { contextWindow: 8192, costPer1kInput: 0.0003, costPer1kOutput: 0.0015, speed: 'fast', quality: 'good' }, 'claude-4-sonnet': { contextWindow: 200000, costPer1kInput: 0.003, costPer1kOutput: 0.015, speed: 'medium', quality: 'excellent' }, 'claude-4-opus': { contextWindow: 200000, costPer1kInput: 0.015, costPer1kOutput: 0.075, speed: 'slow', quality: 'best' }, 'gemini-2.5-pro': { contextWindow: 1000000, costPer1kInput: 0.002, costPer1kOutput: 0.008, speed: 'fast', quality: 'excellent' } };
selectOptimalModel(context: Context, task: Task): ModelChoice { const factors = { contextSize: context.tokenCount, taskComplexity: task.complexity, qualityRequired: task.qualityRequirement, budgetConstraint: task.maxCost, speedRequirement: task.urgency };
// Filter by context window const compatible = Object.entries(this.models) .filter(([_, model]) => model.contextWindow >= factors.contextSize);
// Score each model const scored = compatible.map(([name, model]) => ({ name, model, score: this.scoreModel(model, factors) }));
// Return best match return scored.sort((a, b) => b.score - a.score)[0]; }}
Task Type | Recommended Model | Context Strategy | Expected Tokens |
---|---|---|---|
Simple Edits | Haiku | Minimal (2-3K) | 500-1K output |
Feature Dev | Sonnet | Moderate (10-20K) | 2-5K output |
Refactoring | Sonnet/Opus | Extended (20-50K) | 5-10K output |
Architecture | Opus/Gemini | Full (50K+) | 10K+ output |
Bug Analysis | Gemini | Targeted (30K) | 3-5K output |
// Monitor token usage across sessionsclass TokenMonitor { private usage = new Map<string, TokenUsage>();
async trackRequest(request: AIRequest, response: AIResponse) { const usage: TokenUsage = { timestamp: new Date(), model: request.model, inputTokens: await this.countTokens(request.prompt), outputTokens: await this.countTokens(response.content), cost: this.calculateCost(request.model, inputTokens, outputTokens), task: request.metadata.task, user: request.metadata.user };
this.recordUsage(usage); this.checkAlerts(usage); }
async generateReport(period: Period): Promise<UsageReport> { const usage = this.getUsageForPeriod(period);
return { totalTokens: usage.reduce((sum, u) => sum + u.inputTokens + u.outputTokens, 0), totalCost: usage.reduce((sum, u) => sum + u.cost, 0), byModel: this.groupByModel(usage), byTask: this.groupByTask(usage), byUser: this.groupByUser(usage), trends: this.analyzeTrends(usage), recommendations: this.generateRecommendations(usage) }; }}
// Analytics for token optimizationclass TokenAnalytics { analyzePatterns(usage: TokenUsage[]): AnalysisResult { return { inefficientPatterns: this.findInefficiencies(usage), optimizationOpportunities: this.findOptimizations(usage), costSavingPotential: this.calculateSavings(usage), userBehaviors: this.analyzeUserPatterns(usage) }; }
private findInefficiencies(usage: TokenUsage[]): Inefficiency[] { const inefficiencies = [];
// Large context for simple tasks const oversizedContexts = usage.filter(u => u.task.complexity < 3 && u.inputTokens > 10000 );
// Repeated similar requests const duplicates = this.findDuplicateRequests(usage);
// Inefficient model selection const suboptimalModels = usage.filter(u => this.isSuboptimalModel(u) );
return [...oversizedContexts, ...duplicates, ...suboptimalModels]; }}
// Cache AI responses to reduce token usageclass ResponseCache { private cache = new LRUCache<string, CachedResponse>({ max: 1000, ttl: 1000 * 60 * 60 * 24, // 24 hours updateAgeOnGet: true });
async getCachedOrGenerate( prompt: string, generator: () => Promise<Response> ): Promise<Response> { const key = this.generateCacheKey(prompt); const cached = this.cache.get(key);
if (cached && this.isValid(cached)) { // Check if context has changed significantly if (await this.contextStillValid(cached)) { this.recordCacheHit(key); return cached.response; } }
// Generate new response const response = await generator();
// Cache if appropriate if (this.shouldCache(prompt, response)) { this.cache.set(key, { response, prompt, timestamp: Date.now(), contextHash: await this.hashContext() }); }
return response; }}
// Reuse parts of previous responsesclass PartialResponseCache { async findReusableSegments(newPrompt: string): Promise<ReusableSegment[]> { const segments = []; const similar = await this.findSimilarPrompts(newPrompt);
for (const cached of similar) { const reusable = this.extractReusableSegments( cached.prompt, cached.response, newPrompt );
segments.push(...reusable); }
return this.rankByRelevance(segments, newPrompt); }
buildPromptWithCache( basePrompt: string, segments: ReusableSegment[] ): EnhancedPrompt { return { prompt: basePrompt, context: segments.map(s => ({ type: 'cached_response', content: s.content, relevance: s.relevance })), expectedSavings: this.estimateTokenSavings(segments) }; }}
// Manage token budgets across teams and projectsclass TokenBudgetManager { private budgets = new Map<string, Budget>();
async allocateBudget(period: Period): Promise<AllocationPlan> { const totalBudget = this.getTotalBudget(period); const teams = await this.getTeams(); const historicalUsage = await this.getHistoricalUsage();
// Smart allocation based on multiple factors const allocations = teams.map(team => ({ team, allocation: this.calculateAllocation(team, { historicalUsage: historicalUsage[team.id], teamSize: team.size, projectPriority: team.priority, efficiency: this.calculateEfficiency(team) }) }));
return { period, totalBudget, allocations, rules: this.generateBudgetRules(allocations) }; }
enforcebudget(request: TokenRequest): Promise<boolean> { const budget = this.budgets.get(request.teamId);
if (!budget) return false;
const projected = budget.used + request.estimatedTokens;
if (projected > budget.limit) { // Check if request qualifies for exception if (this.qualifiesForException(request)) { return this.requestBudgetException(request); }
return false; }
return true; }}
Prompt Templates
Reuse optimized prompts for common tasks
Batch Processing
Combine multiple small requests into one
Progressive Enhancement
Start simple, add context only if needed
Off-Peak Usage
Schedule non-urgent tasks for lower rates
// Intelligently prune context based on relevanceclass DynamicContextPruner { async pruneContext( context: Context, targetTokens: number ): Promise<PrunedContext> { // Score each context item const scored = await Promise.all( context.items.map(async item => ({ item, score: await this.scoreRelevance(item, context.task) })) );
// Sort by relevance scored.sort((a, b) => b.score - a.score);
// Select items within token budget const selected = []; let tokenCount = 0;
for (const { item, score } of scored) { const itemTokens = await this.countTokens(item);
if (tokenCount + itemTokens <= targetTokens) { selected.push(item); tokenCount += itemTokens; } else if (score > 0.8) { // Try to compress high-value items const compressed = await this.compress(item); if (tokenCount + compressed.tokens <= targetTokens) { selected.push(compressed.item); tokenCount += compressed.tokens; } } }
return { items: selected, totalTokens: tokenCount, prunedItems: scored.length - selected.length }; }}
// Reuse tokens from previous interactionsclass TokenRecycler { async recycleTokens(conversation: Conversation): Promise<RecycledContext> { const messages = conversation.messages; const recycled = [];
// Identify reusable segments for (let i = 0; i < messages.length - 1; i++) { const message = messages[i];
if (this.isReusable(message)) { recycled.push({ content: this.extractReusableContent(message), summary: await this.summarize(message), tokens: await this.countTokens(message) }); } }
// Build compressed context return { summary: this.buildSummary(recycled), keyPoints: this.extractKeyPoints(recycled), savedTokens: this.calculateSavings(messages, recycled) }; }}
// Manage shared token pools for teamsclass SharedTokenPool { async drawFromPool(request: PoolRequest): Promise<TokenAllocation> { const pool = await this.getPool(request.teamId);
// Check availability if (pool.available < request.tokens) { // Try to borrow from other pools const borrowed = await this.borrowTokens( request.teamId, request.tokens - pool.available );
if (!borrowed.success) { throw new InsufficientTokensError(); } }
// Allocate tokens const allocation = { tokens: request.tokens, user: request.userId, purpose: request.purpose, timestamp: new Date(), expiresAt: this.calculateExpiry(request) };
await this.recordAllocation(allocation);
return allocation; }}
Monitor Continuously
Optimize Proactively
Educate Team
Iterate and Improve
Metric | Formula | Target |
---|---|---|
Token Efficiency | Output Value / Tokens Used | >0.8 |
Cache Hit Rate | Cached Responses / Total Requests | >30% |
Cost per Feature | Token Cost / Features Delivered | Decreasing |
Context Efficiency | Relevant Tokens / Total Context | >70% |
Remember: Effective token management isn’t about using fewer tokens—it’s about extracting maximum value from every token used. Focus on ROI, not just cost reduction.