Why streaming matters
Without streaming, users wait 3-8 seconds staring at a blank screen. With streaming, the first token appears in as little as 200 ms (typically 200-500 ms). That difference determines whether users stay or leave.
FastAPI streaming implementation
Python
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from openai import OpenAI
import json
# Application object that the route decorators below register against.
app = FastAPI()

# Shared client aimed at the OpenAI-compatible endpoint given in base_url.
# The key is a placeholder: substitute a real one before running, and keep
# real keys out of source control.
client = OpenAI(
    base_url="https://api.izziapi.com/v1",
    api_key="izzi-YOUR_KEY_HERE",
)
@app.post("/chat/stream")
async def stream_chat(request: dict):
    """Stream a chat completion back to the caller as Server-Sent Events.

    Args:
        request: Request body as a dict. ``message`` is sent as the single
            user message (default ``""``); ``model`` selects the model
            (default ``"claude-sonnet-4-20250514"``).

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``. Each
        non-empty delta is emitted as ``data: {"content": ...}`` and the
        stream is terminated by ``data: [DONE]``.
    """
    message = request.get("message", "")
    model = request.get("model", "claude-sonnet-4-20250514")

    # Deliberately a plain (sync) generator: `client` is the blocking OpenAI
    # client, so iterating it inside an `async def` generator would stall the
    # event loop for every other request while waiting on the network.
    # StreamingResponse iterates sync generators in a worker thread instead.
    def generate():
        stream = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": message}],
            stream=True,
        )
        for chunk in stream:
            # An OpenAI-compatible backend may send chunks with no choices
            # (for example a usage-only chunk); indexing [0] would raise.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield f"data: {json.dumps({'content': content})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )


# Frontend: consuming the stream
TypeScript
/**
 * POST `message` to /chat/stream and consume the Server-Sent Events reply.
 *
 * Calls `updateUI` with the accumulated text each time new content arrives.
 *
 * @param message - The user message to send.
 * @returns The full concatenated response text.
 * @throws Error if the HTTP response is not OK, has no body, or the stream
 *         carries an `error` event.
 */
async function streamChat(message: string) {
  const response = await fetch("/chat/stream", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ message }),
  });
  // Without this check an error response would be read as an empty answer.
  if (!response.ok || !response.body) {
    throw new Error(`Stream request failed with status ${response.status}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let fullResponse = "";
  // Network chunks do not line up with SSE events, so keep the unfinished
  // tail of each read here until its terminating blank line arrives.
  let buffer = "";

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // stream: true keeps a multi-byte character that is split across two
    // chunks from being decoded as replacement characters.
    buffer += decoder.decode(value, { stream: true });

    const events = buffer.split("\n\n");
    // The last element is either "" or an incomplete event; hold it back.
    buffer = events.pop() || "";

    for (const event of events) {
      if (!event.startsWith("data: ")) continue;
      const payload = event.slice(6);
      if (payload === "[DONE]") return fullResponse;

      const { content, error } = JSON.parse(payload);
      // The server reports failures in-band as {"error": "..."}.
      if (error) throw new Error(String(error));
      if (content) {
        fullResponse += content;
        updateUI(fullResponse); // Update your UI here
      }
    }
  }
  return fullResponse;
}

// Performance comparison
| Metric | Non-streaming | Streaming (SSE) |
|---|---|---|
| Time to first token | 3-8 seconds | 200-500ms |
| Perceived latency | Very slow | Instant |
| User engagement | 40% drop-off | 5% drop-off |
| Token cost | Same | Same |
Error handling in streams
Python
async def generate_with_error_handling():
    """Yield Server-Sent Events for a streamed completion, reporting errors in-band.

    Yields:
        ``data: {"content": ...}`` for each non-empty delta; if anything
        raises, a single ``data: {"error": ...}`` event carrying ``str(e)``;
        and finally ``data: [DONE]``.
    """
    import asyncio  # function-scope import: only this generator needs it

    loop = asyncio.get_running_loop()
    exhausted = object()  # sentinel so next() never raises StopIteration

    def open_stream():
        return client.chat.completions.create(
            model="claude-sonnet-4-20250514",
            messages=messages,
            stream=True,
            timeout=30,
        )

    try:
        # `client` is the blocking OpenAI client; run its network waits in the
        # default executor so the event loop stays free for other requests.
        stream = await loop.run_in_executor(None, open_stream)
        chunks = iter(stream)
        while True:
            chunk = await loop.run_in_executor(None, next, chunks, exhausted)
            if chunk is exhausted:
                break
            # An OpenAI-compatible backend may send chunks with no choices;
            # indexing [0] on those would raise IndexError.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield f"data: {json.dumps({'content': content})}\n\n"
    except Exception as e:
        yield f"data: {json.dumps({'error': str(e)})}\n\n"
    # Not in a `finally`: yielding while the generator is being closed (for
    # example after a client disconnect) raises
    # "RuntimeError: async generator ignored GeneratorExit".
    yield "data: [DONE]\n\n"