Multimodal Browser AI with Transformers.js for Images and Speech

<title>Multimodal Media Analyzer</title>

/*
 * Stylesheet for the Multimodal Media Analyzer page.
 * All property names, values and class selectors use ASCII hyphens; the
 * en-dash variants are not valid CSS and never match the hyphenated class
 * names used in the markup.
 */

* { box-sizing: border-box; margin: 0; padding: 0; }

body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}

header { margin-bottom: 1.5rem; }

header h1 { font-size: 1.5rem; }

header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

/* Model status indicators */

.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}

.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}

.model-badge.ready { background: #dcfce7; color: #15803d; }

/* Tab bar */

.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}

.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}

.tab.active { background: #2563eb; color: white; font-weight: 600; }

/* Input panels */

.panel { display: none; }

.panel.active { display: block; }

.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}

.upload-area input { display: none; }

#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none;
  object-fit: cover;
}

.mic-center { text-align: center; padding: 1rem 0; }

#rec-btn {
  width: 72px; height: 72px;
  border-radius: 50%; border: none;
  background: #dc2626; color: white;
  font-size: 1.6rem; cursor: pointer;
  display: flex; align-items: center; justify-content: center;
  margin: 0 auto 0.5rem;
}

#rec-btn.recording { background: #374151; }

#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }

#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }

#rec-hint { font-size: 0.85rem; color: #64748b; }

#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

/* Results grid */

.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}

.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}

.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}

.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}

.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}

.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}

.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }

#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Collapse the results grid to a single column on narrow screens */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}

<h1>Multimodal Media Analyzer</h1>

<p>Image classification, captioning, and speech transcription — all in your browser.</p>

<span class="model-badge" id="badge-cls">Classifier: loading...</span>

<span class="model-badge" id="badge-cap">Captioner: loading...</span>

<span class="model-badge" id="badge-asr">Whisper: loading...</span>

</div>

</header>

<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>

<div class="tab active" data-tab="image">🖼 Image Analysis</div>

<div class="tab" data-tab="speech">🎙 Speech Transcription</div>

</div>

<!-- Image panel -->

<p>Click or drag an image to analyze</p>

JPG, PNG, WebP, GIF supported

</p>

</div>

</div>

<!-- Speech panel -->

<div id="rec-hint">Waiting for Whisper model...</div>

</div>

<!-- Results – shown for both modes -->

<!-- Image results (shown in image mode) -->

<h3>Classification</h3>

<p class="placeholder-text">No results yet.</p>

</div>

<h3>Caption</h3>

<p class="placeholder-text">No caption yet.</p>

</div>

<!-- Speech results (shown in speech mode) -->

<h3>Transcription</h3>

<p class="placeholder-text">Record audio to see the transcription.</p>

</div>

import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';

// ── Pipeline references ───────────────────────────────────────────────

// Pipeline handles; each stays undefined until its model has been loaded.
let classifier;
let captioner;
let transcriber;

// Number of distinct model badges that have been switched to "ready".
let readyCount = 0;

/**
 * Switch a model badge to its "ready" state and, once all three badges are
 * ready, unlock the recording controls.
 *
 * The call is idempotent per badge: callers may invoke it from progress
 * events that fire more than once for the same model, and counting those
 * repeats would unlock the UI before every model is available.
 *
 * @param {string} badgeId - Element id of the badge to update.
 * @param {string} label - Human-readable model name shown in the badge.
 * @returns {void}
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);
  if (!badge || badge.classList.contains('ready')) {
    return;
  }

  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent = 'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}

// Load all three pipelines simultaneously.
//
// Each entry names the task, the model checkpoint, the badge to flip and a
// setter for the module-level handle. A model is marked ready only after its
// own pipeline() promise has resolved and its handle has been stored, so the
// UI is never unlocked while a handle is still undefined. (A progress
// callback's 'done' status is reported per downloaded file, which is too
// early and repeats several times per model.)
const MODEL_SPECS = [
  {
    task: 'image-classification',
    model: 'Xenova/vit-base-patch16-224',
    badgeId: 'badge-cls',
    label: 'Classifier',
    assign: (pipe) => { classifier = pipe; },
  },
  {
    task: 'image-to-text',
    model: 'Xenova/vit-gpt2-image-captioning',
    badgeId: 'badge-cap',
    label: 'Captioner',
    assign: (pipe) => { captioner = pipe; },
  },
  {
    task: 'automatic-speech-recognition',
    model: 'Xenova/whisper-tiny.en',
    badgeId: 'badge-asr',
    label: 'Whisper',
    assign: (pipe) => { transcriber = pipe; },
  },
];

Promise.all(
  MODEL_SPECS.map(async ({ task, model, badgeId, label, assign }) => {
    const pipe = await pipeline(task, model, { dtype: 'q8' });
    // Store the handle before flipping the badge so "ready" is always true.
    assign(pipe);
    markReady(badgeId, label);
  }),
).catch((err) => {
  globalStatus.textContent = `Error loading models: ${err.message}`;
});

// ── UI references ─────────────────────────────────────────────────────

// Elements shared by the handlers below, looked up once at module load.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
const waveCanvas = document.getElementById('wave-canvas');

// 2D drawing context used for the live waveform while recording.
const waveCtx = waveCanvas.getContext('2d');

// ── Image analysis ────────────────────────────────────────────────────

/**
 * Escape a value for safe interpolation into an HTML string.
 *
 * @param {unknown} value - Value to render as text.
 * @returns {string} The value as a string with HTML metacharacters escaped.
 */
function escapeHtml(value) {
  return String(value)
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;')
    .replaceAll("'", '&#39;');
}

/**
 * Classify and caption an image, then render both results.
 *
 * Shows the classification and caption cards (hiding the speech card), runs
 * both pipelines in parallel, and writes the top labels and the generated
 * caption into their cards. Progress and errors are reported through the
 * global status line; the function itself resolves without a value.
 *
 * @param {string} dataUrl - Image source passed to both pipelines.
 * @returns {Promise<void>}
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  globalStatus.textContent = 'Running classification and captioning…';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');

  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels, one row per label. Labels come from the
    // model, so they are escaped before being placed in markup.
    clsContent.innerHTML = classResults
      .map(({ label, score }) => `
        <div class="label-item">
          <span>${escapeHtml(label)}</span>
          <span class="label-score">${(score * 100).toFixed(1)}%</span>
        </div>`)
      .join('');

    // Render generated caption
    const caption = captionResults[0]?.generated_text ?? 'No caption.';
    capContent.innerHTML = `<p class="caption-body">“${escapeHtml(caption)}”</p>`;

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// File upload handler for images
const imgDrop = document.getElementById('img-drop');
const imgInput = document.getElementById('img-input');
const imgPrev = document.getElementById('img-preview');

/**
 * Preview an image file and send it for analysis.
 *
 * Files whose MIME type does not start with "image/" (and a missing file)
 * are ignored. The file is read as a data URL, shown in the preview element
 * and passed to analyzeImage(). Read failures are reported on the global
 * status line.
 *
 * @param {File | undefined} file - File picked or dropped by the user.
 * @returns {void}
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) {
    return;
  }

  const reader = new FileReader();

  reader.onload = () => {
    imgPrev.src = reader.result;
    imgPrev.style.display = 'block';
    analyzeImage(reader.result);
  };

  reader.onerror = () => {
    globalStatus.textContent = `Error: could not read ${file.name}`;
  };

  reader.readAsDataURL(file);
}

// Clicking the drop zone opens the hidden file picker.
imgDrop.addEventListener('click', () => imgInput.click());

imgInput.addEventListener('change', (e) => {
  handleImageFile(e.target.files[0]);
  // Clear the selection so choosing the same file again fires `change`.
  e.target.value = '';
});

// Cancelling dragover is what allows the element to receive `drop`.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());

imgDrop.addEventListener('drop', (e) => {
  e.preventDefault();
  handleImageFile(e.dataTransfer.files[0]);
});

// ── Audio decoding helper ─────────────────────────────────────────────

/**
 * Decode encoded audio bytes into samples from the first channel.
 *
 * A temporary AudioContext configured for a 16 kHz sample rate performs the
 * decoding. The context is closed afterwards, on success and on failure, so
 * repeated recordings do not accumulate open contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<Float32Array>} Samples of channel 0 of the decoded audio.
 * @throws Rejects with the error raised by decodeAudioData() if the data
 *   cannot be decoded.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0); // First channel only
  } finally {
    await audioCtx.close();
  }
}

// ── Speech transcription ──────────────────────────────────────────────

/**
 * Transcribe audio samples and render the text in the speech card.
 *
 * Shows the speech card (hiding the image cards), runs the transcriber with
 * 30 s chunks and 5 s stride, and writes the trimmed transcript as plain
 * text. Progress and errors are reported on the global status line. If the
 * speech model is not loaded yet, only the status line is updated.
 *
 * @param {Float32Array} audioData - Audio samples passed to the transcriber.
 * @returns {Promise<void>}
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  const asrContent = document.getElementById('asr-content');
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    // Insert the transcript as text, never as markup: it is arbitrary
    // recognized speech and may contain characters such as "<".
    const paragraph = document.createElement('p');
    paragraph.className = 'transcript-body';
    paragraph.textContent = result.text.trim();
    asrContent.replaceChildren(paragraph);

    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// ── Microphone recording ──────────────────────────────────────────────

// Recording state shared between the waveform renderer and the record button.
let mediaRecorder;
let audioChunks = [];
let timerInterval;
let analyserNode;
let animId;
let secs = 0;

/**
 * Draw one frame of the live waveform and schedule the next one.
 *
 * Reads the analyser's current time-domain bytes, plots them as a polyline
 * across the full canvas width, and stores the animation-frame id in
 * `animId` so the loop can be cancelled. Requires `analyserNode` to be set.
 *
 * @returns {void}
 */
function drawWave() {
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  const { width, height } = waveCanvas;
  waveCtx.clearRect(0, 0, width, height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  samples.forEach((value, index) => {
    const x = (index / samples.length) * width;
    // A byte of 128 maps to the vertical centre of the canvas.
    const y = (value / 128.0) * (height / 2);
    if (index === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  });

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}

/**
 * Stop the active recording and reset the recording UI.
 *
 * The recorder's `stop` event handler (installed by startRecording) takes
 * over from here; the button stays disabled until that handler finishes so a
 * new recording cannot start while the previous one is being processed.
 *
 * @returns {void}
 */
function stopRecording() {
  recBtn.disabled = true;
  mediaRecorder.stop();
  recBtn.classList.remove('recording');
  recBtn.textContent = '🎙';
  clearInterval(timerInterval);
  cancelAnimationFrame(animId);
  waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
  recHint.textContent = 'Processing…';
}

/**
 * Ask for microphone access and start recording with a live waveform.
 *
 * When the recording is stopped, the captured chunks are decoded, passed to
 * runTranscription(), and the microphone and the visualizer's AudioContext
 * are released.
 *
 * @returns {Promise<void>}
 * @throws Rejects if microphone access is denied or the recorder cannot be
 *   created; any stream that was already opened is released first.
 */
async function startRecording() {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const actx = new AudioContext();

  // Releases the microphone and the visualizer context for this recording.
  const releaseInputs = async () => {
    stream.getTracks().forEach((track) => track.stop());
    if (actx.state !== 'closed') {
      await actx.close();
    }
  };

  let recorder;
  try {
    analyserNode = actx.createAnalyser();
    analyserNode.fftSize = 256;
    actx.createMediaStreamSource(stream).connect(analyserNode);
    recorder = new MediaRecorder(stream);
  } catch (err) {
    await releaseInputs();
    throw err;
  }

  mediaRecorder = recorder;
  audioChunks = [];

  recorder.ondataavailable = (e) => {
    if (e.data.size > 0) {
      audioChunks.push(e.data);
    }
  };

  recorder.onstop = async () => {
    try {
      // All chunks have been delivered by now, so the inputs can go first.
      await releaseInputs();
      // Use the container the recorder actually produced; fall back to WebM.
      const blob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/webm' });
      const audioData = await decodeAudio(await blob.arrayBuffer());
      await runTranscription(audioData);
      recHint.textContent = 'Click to record again.';
    } catch (err) {
      recHint.textContent = `Audio error: ${err.message}`;
    } finally {
      recBtn.disabled = false;
    }
  };

  recorder.start();
  recBtn.classList.add('recording');
  recBtn.textContent = '⏹';

  secs = 0;
  recTimer.textContent = '0:00';
  timerInterval = setInterval(() => {
    secs++;
    recTimer.textContent =
      `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
  }, 1000);

  recHint.textContent = 'Recording… click to stop.';
  drawWave();
}

// The record button toggles between starting and stopping a recording.
recBtn.addEventListener('click', async () => {
  if (mediaRecorder?.state === 'recording') {
    stopRecording();
    return;
  }

  try {
    await startRecording();
  } catch (err) {
    recHint.textContent = `Mic error: ${err.message}`;
  }
});

// ── Tab switching ─────────────────────────────────────────────────────

// Clicking a tab deactivates every tab and panel, then activates the clicked
// tab and the panel whose id is "panel-" followed by the tab's data-tab value.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document.querySelectorAll('.tab, .panel').forEach((el) => {
      el.classList.remove('active');
    });

    tab.classList.add('active');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('active');
  });
});

Source_link

Multimodal Browser AI with Transformers.js for Images and Speech

Q&A: What is agentic AI today, and what do we want it to be? | MIT News

Meta AI Releases Brain2Qwerty v2: A Non-Invasive MEG Brain-to-Text Pipeline Decoding Typed Sentences at 61% Word Accuracy

Related Posts

Q&A: What is agentic AI today, and what do we want it to be? | MIT News

Meta AI Releases Brain2Qwerty v2: A Non-Invasive MEG Brain-to-Text Pipeline Decoding Typed Sentences at 61% Word Accuracy

Multi-Label Text Classification with Scikit-LLM

Inaugural Music Technology Research Showcase celebrates work of new graduate program’s initial students | MIT News

OpenClaw Releases iOS and Android Companion Node Apps That Connect a Phone to a Self-Hosted AI Agent Gateway

Python Concepts Every AI Engineer Must Master

42 Pinterest stats that matter to marketers in 2026

POPULAR NEWS

Trump ends trade talks with Canada over a digital services tax

15 Trending Songs on TikTok in 2025 (+ How to Use Them)

Communication Effectiveness Skills For Business Leaders

App Development Cost in Singapore: Pricing Breakdown & Insights

Comparing the Top 7 Large Language Models LLMs/Systems for Coding in 2025

EDITOR'S PICK

Inside the Creative Artificial Intelligence (AI) Stack: Where Human Vision and Artificial Intelligence Meet to Design Future Fashion

Google announces new AI tools for mental health research and treatment

Canada invests in 23 Canadian creative companies to help them reach global success

RAG Without Vectors: How PageIndex Retrieves by Reasoning

About

Categories

Recent Posts

Multimodal Browser AI with Transformers.js for Images and Speech

READ ALSO

Related Posts

POPULAR NEWS

EDITOR'S PICK

About

Categories

Recent Posts