# Batch Processing

Efficiently process multiple files or code snippets using batch-processing strategies.

## Why Batch Processing?

When processing multiple files, batch processing provides:
- Better Performance: Parallel processing with concurrency control
- Progress Tracking: Monitor processing status
- Error Handling: Graceful handling of individual failures
- Resource Management: Control memory and CPU usage
## Basic Batch Processing

### Processing Multiple Files

```typescript
import fs from 'node:fs/promises'
import { Tokenizer } from 'ts-syntax-highlighter'

async function processFiles(files: string[], language: string) {
  const tokenizer = new Tokenizer(language)
  const results = new Map()

  for (const file of files) {
    const code = await fs.readFile(file, 'utf-8')
    const tokens = await tokenizer.tokenizeAsync(code)
    results.set(file, tokens)
  }

  return results
}

// Usage
const files = ['file1.ts', 'file2.ts', 'file3.ts']
const results = await processFiles(files, 'typescript')
```
### Parallel Processing

```typescript
async function processFilesParallel(files: string[], language: string) {
  const tokenizer = new Tokenizer(language)

  const results = await Promise.all(
    files.map(async (file) => {
      const code = await fs.readFile(file, 'utf-8')
      const tokens = await tokenizer.tokenizeAsync(code)
      return { file, tokens }
    })
  )

  return new Map(results.map(r => [r.file, r.tokens]))
}

// Usage
const results = await processFilesParallel(files, 'typescript')
```

Note that `Promise.all` starts every file read at once. That is fine for small batches, but on large file sets it can exhaust file descriptors or memory, which is why the next section bounds the number of in-flight tasks.
## Controlled Concurrency

### Limit Concurrent Processing

```typescript
async function processWithConcurrency(
  files: string[],
  language: string,
  concurrency: number = 5
) {
  const tokenizer = new Tokenizer(language)
  const results = new Map()
  const queue = [...files]

  // Each worker pulls files off the shared queue until it is empty
  async function worker(): Promise<void> {
    while (queue.length > 0) {
      const file = queue.shift()
      if (!file) return
      try {
        const code = await fs.readFile(file, 'utf-8')
        const tokens = await tokenizer.tokenizeAsync(code)
        results.set(file, tokens)
      } catch (error) {
        console.error(`Failed to process ${file}:`, error)
        results.set(file, null)
      }
    }
  }

  // Start up to `concurrency` concurrent workers
  const workers = Array.from(
    { length: Math.min(concurrency, files.length) },
    () => worker()
  )
  await Promise.all(workers)

  return results
}

// Usage
const results = await processWithConcurrency(files, 'typescript', 10)
```
## Progress Tracking

### With Progress Callbacks

```typescript
interface BatchConfig {
  concurrency: number
  onProgress?: (completed: number, total: number) => void
  onError?: (file: string, error: Error) => void
  onSuccess?: (file: string) => void
}

async function batchProcess(
  files: string[],
  language: string,
  config: BatchConfig = { concurrency: 5 }
) {
  const tokenizer = new Tokenizer(language)
  const results = new Map()
  let completed = 0

  for (let i = 0; i < files.length; i += config.concurrency) {
    const batch = files.slice(i, i + config.concurrency)

    const batchResults = await Promise.all(
      batch.map(async (file) => {
        try {
          const code = await fs.readFile(file, 'utf-8')
          const tokens = await tokenizer.tokenizeAsync(code)
          config.onSuccess?.(file)
          return { file, tokens, error: null }
        } catch (error) {
          config.onError?.(file, error as Error)
          return { file, tokens: null, error: error as Error }
        }
      })
    )

    batchResults.forEach(({ file, tokens }) => {
      results.set(file, tokens)
      completed++
      config.onProgress?.(completed, files.length)
    })
  }

  return results
}

// Usage
const results = await batchProcess(files, 'typescript', {
  concurrency: 10,
  onProgress: (done, total) => {
    console.log(`Progress: ${done}/${total} (${Math.round(done / total * 100)}%)`)
  },
  onError: (file, error) => {
    console.error(`❌ Failed: ${file}`, error.message)
  },
  onSuccess: (file) => {
    console.log(`✅ Processed: ${file}`)
  }
})
```
## Advanced Batch Processing

### With Retry Logic

```typescript
async function processWithRetry(
  file: string,
  tokenizer: Tokenizer,
  maxRetries: number = 3
) {
  let lastError: Error | null = null

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const code = await fs.readFile(file, 'utf-8')
      return await tokenizer.tokenizeAsync(code)
    } catch (error) {
      lastError = error as Error
      if (attempt < maxRetries - 1) {
        // Wait before retrying (exponential backoff: 100ms, 200ms, 400ms, ...)
        await new Promise(resolve =>
          setTimeout(resolve, Math.pow(2, attempt) * 100)
        )
      }
    }
  }

  throw lastError
}

async function batchProcessWithRetry(
  files: string[],
  language: string,
  maxRetries: number = 3
) {
  const tokenizer = new Tokenizer(language)
  const results = new Map()

  const promises = files.map(async (file) => {
    try {
      const tokens = await processWithRetry(file, tokenizer, maxRetries)
      results.set(file, { success: true, tokens })
    } catch (error) {
      results.set(file, { success: false, error: error as Error })
    }
  })

  await Promise.all(promises)
  return results
}
```
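The result map records per-file success, so callers can report failures without losing the successful files. A minimal usage sketch, reusing the `files` array from above and assuming `tokenizeAsync` resolves to an array of per-line tokens (as in the streaming example later in this page):

```typescript
// Usage
const results = await batchProcessWithRetry(files, 'typescript', 3)
for (const [file, result] of results) {
  if (result.success)
    console.log(`✅ ${file}: tokenized ${result.tokens.length} lines`)
  else
    console.error(`❌ ${file}: ${result.error.message}`)
}
```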
### With Caching

```typescript
class CachedBatchProcessor {
  private cache = new Map<string, any>()
  private tokenizer: Tokenizer

  constructor(language: string) {
    this.tokenizer = new Tokenizer(language)
  }

  async process(files: string[]) {
    const results = new Map()

    for (const file of files) {
      // Check the cache first
      if (this.cache.has(file)) {
        results.set(file, this.cache.get(file))
        continue
      }

      // Process and cache
      const code = await fs.readFile(file, 'utf-8')
      const tokens = await this.tokenizer.tokenizeAsync(code)
      this.cache.set(file, tokens)
      results.set(file, tokens)
    }

    return results
  }

  clearCache() {
    this.cache.clear()
  }

  getCacheSize() {
    return this.cache.size
  }
}

// Usage
const processor = new CachedBatchProcessor('typescript')
const results1 = await processor.process(files) // Processes all files
const results2 = await processor.process(files) // Served from cache
```
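One caveat: the cache above is keyed by path alone, so it will serve stale tokens if a file changes between runs. A minimal sketch of one way to guard against that, keying entries by the file's modification time (the `fs.stat`-based check is an illustration, not part of the library):

```typescript
// (Reuses the fs and Tokenizer imports from the first example)
class FreshCachedBatchProcessor {
  private cache = new Map<string, { mtimeMs: number, tokens: any }>()
  private tokenizer: Tokenizer

  constructor(language: string) {
    this.tokenizer = new Tokenizer(language)
  }

  async process(files: string[]) {
    const results = new Map()

    for (const file of files) {
      const { mtimeMs } = await fs.stat(file)
      const cached = this.cache.get(file)

      // Reuse cached tokens only if the file has not been modified since
      if (cached && cached.mtimeMs === mtimeMs) {
        results.set(file, cached.tokens)
        continue
      }

      const code = await fs.readFile(file, 'utf-8')
      const tokens = await this.tokenizer.tokenizeAsync(code)
      this.cache.set(file, { mtimeMs, tokens })
      results.set(file, tokens)
    }

    return results
  }
}
```

Hashing the file contents instead of comparing `mtimeMs` is stricter, at the cost of reading every file on every run.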
## Memory Management

### Streaming Processing

```typescript
// `LineTokens` is the per-line token type used in these examples; adjust the
// import if your version of the package exports it under a different name.
import type { LineTokens } from 'ts-syntax-highlighter'

async function streamProcess(
  files: string[],
  language: string,
  callback: (file: string, tokens: LineTokens[]) => void
) {
  const tokenizer = new Tokenizer(language)

  for (const file of files) {
    const code = await fs.readFile(file, 'utf-8')
    const tokens = await tokenizer.tokenizeAsync(code)

    // Hand the tokens off immediately instead of accumulating them
    callback(file, tokens)

    // Yield to the event loop so processed tokens can be garbage-collected
    await new Promise(resolve => setImmediate(resolve))
  }
}

// Usage
await streamProcess(files, 'typescript', (file, tokens) => {
  // Process tokens immediately
  const keywords = tokens.flatMap(line =>
    line.tokens.filter(t => t.type.includes('keyword'))
  )
  console.log(`${file}: ${keywords.length} keywords`)
})
```
### Chunked Processing

```typescript
async function processInChunks(
  files: string[],
  language: string,
  chunkSize: number = 100
) {
  const tokenizer = new Tokenizer(language)
  const allResults = []

  for (let i = 0; i < files.length; i += chunkSize) {
    const chunk = files.slice(i, i + chunkSize)

    const chunkResults = await Promise.all(
      chunk.map(async (file) => {
        const code = await fs.readFile(file, 'utf-8')
        return {
          file,
          tokens: await tokenizer.tokenizeAsync(code)
        }
      })
    )

    allResults.push(...chunkResults)

    // Hint at garbage collection between chunks
    // (global.gc is only defined when Node is started with --expose-gc)
    if (global.gc) global.gc()
  }

  return allResults
}
```
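A usage sketch (the right chunk size depends on file sizes and available memory; 100 is just a default):

```typescript
// Usage
const allResults = await processInChunks(files, 'typescript', 50)
console.log(`Tokenized ${allResults.length} files`)
```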
## Real-World Examples

### Processing a Project

```typescript
import { glob } from 'glob'

async function processProject(
  pattern: string,
  config: BatchConfig = { concurrency: 10 }
) {
  // Find all files matching the pattern
  const files = await glob(pattern)
  console.log(`Found ${files.length} files`)

  // Group files by language
  const byLanguage = new Map<string, string[]>()
  files.forEach((file) => {
    const ext = file.split('.').pop()
    const lang = ext === 'ts' || ext === 'tsx' ? 'typescript' : 'javascript'
    if (!byLanguage.has(lang)) {
      byLanguage.set(lang, [])
    }
    byLanguage.get(lang)!.push(file)
  })

  // Process each language group
  const results = new Map()
  for (const [lang, langFiles] of byLanguage) {
    console.log(`Processing ${langFiles.length} ${lang} files...`)
    const langResults = await batchProcess(langFiles, lang, config)
    langResults.forEach((tokens, file) => {
      results.set(file, tokens)
    })
  }

  return results
}

// Usage
const results = await processProject('src/**/*.{ts,tsx}', {
  concurrency: 20,
  onProgress: (done, total) => {
    process.stdout.write(`\rProgress: ${done}/${total}`)
  },
  onError: (file, error) => {
    console.error(`\n❌ ${file}: ${error.message}`)
  }
})
console.log(`\n✅ Processed ${results.size} files`)
```
### Generating Statistics

```typescript
interface Stats {
  totalFiles: number
  totalLines: number
  totalTokens: number
  byType: Record<string, number>
}

async function generateStats(files: string[], language: string): Promise<Stats> {
  const stats: Stats = {
    totalFiles: 0,
    totalLines: 0,
    totalTokens: 0,
    byType: {}
  }

  await streamProcess(files, language, (file, tokens) => {
    stats.totalFiles++
    stats.totalLines += tokens.length

    tokens.forEach((line) => {
      line.tokens.forEach((token) => {
        stats.totalTokens++

        // Count by top-level type (e.g. `keyword.control` counts as `keyword`)
        const type = token.type.split('.')[0]
        stats.byType[type] = (stats.byType[type] || 0) + 1
      })
    })
  })

  return stats
}

// Usage
const stats = await generateStats(files, 'typescript')
console.log(JSON.stringify(stats, null, 2))
```
## Performance Considerations

### Optimal Concurrency

```typescript
import os from 'node:os'

// Use the CPU count as a baseline
const optimalConcurrency = os.cpus().length

// I/O-bound work (file reads) tolerates more in-flight tasks
// than CPU-bound work (tokenizing large files)
const ioBoundConcurrency = optimalConcurrency * 2
const cpuBoundConcurrency = optimalConcurrency
```
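These values plug straight into the bounded helper from earlier; a sketch (the `* 2` multiplier is a common rule of thumb, not a measured optimum, so benchmark against your own workload):

```typescript
// Reading many small files is mostly I/O-bound
const results = await processWithConcurrency(files, 'typescript', ioBoundConcurrency)
```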
### Memory Monitoring

```typescript
function checkMemory() {
  const usage = process.memoryUsage()
  const usedMB = Math.round(usage.heapUsed / 1024 / 1024)
  const totalMB = Math.round(usage.heapTotal / 1024 / 1024)
  console.log(`Memory: ${usedMB}MB / ${totalMB}MB`)

  // Warn when using > 80% of the allocated heap
  if (usage.heapUsed / usage.heapTotal > 0.8) {
    console.warn('⚠️ High memory usage')
  }
}

// Check periodically during batch processing
const memoryInterval = setInterval(checkMemory, 5000)
await batchProcess(files, 'typescript', { concurrency: 10 })
clearInterval(memoryInterval)
```
## Next Steps

- Learn about Code Analysis
- Explore Performance Tuning
- Check out Custom Themes