Why AI scraping beats traditional scraping
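Traditional scrapers break when websites change their HTML. AI scrapers understand meaning, not markup. Tell Claude "extract all prices" and it works regardless of whether prices are in <span class="price"> or <div data-amount="...">. (One caveat: the scraper below sends only the page's text to the model, so a value that exists solely in an attribute such as data-amount must also appear in the text to be extracted.)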
The complete scraper
import httpx
from openai import OpenAI
from bs4 import BeautifulSoup
import json
# OpenAI-compatible client pointed at the api.izziapi.com endpoint.
# Replace the placeholder key with a real one before running.
client = OpenAI(base_url="https://api.izziapi.com/v1", api_key="izzi-YOUR_KEY_HERE")
def _parse_json_reply(reply: str):
    """Decode a model reply as JSON, tolerating Markdown code fences.

    The reply is first parsed after stripping a leading "```json" and a
    trailing "```". If that fails, the first fenced block found anywhere in
    the reply (any or no language tag) is parsed instead, which covers bare
    "```" fences and replies that wrap the JSON in explanatory prose.

    Raises:
        json.JSONDecodeError: if no candidate decodes as JSON.
    """
    import re  # local import: the file's import block does not provide ``re``

    text = reply.strip()
    try:
        return json.loads(text.removeprefix("```json").removesuffix("```").strip())
    except json.JSONDecodeError:
        fenced = re.search(r"```[\w-]*\s*(.*?)\s*```", text, re.DOTALL)
        if fenced is None:
            # Nothing else to try; surface the original decode error.
            raise
        return json.loads(fenced.group(1))


def scrape_with_ai(
    url: str,
    instruction: str,
    *,
    model: str = "claude-sonnet-4-20250514",
) -> dict:
    """Scrape any URL and extract structured data using AI.

    Args:
        url: Page to fetch.
        instruction: Natural-language description of what to extract.
        model: Model name passed to the chat-completions call.

    Returns:
        The JSON value decoded from the model's reply. This is whatever the
        model produced, so it can be a list rather than a dict when the
        instruction asks for several items.

    Raises:
        httpx.HTTPStatusError: if the page request ends in an error status.
        ValueError: if the model reply has no text content.
        json.JSONDecodeError: if the reply cannot be decoded as JSON.
    """
    # Step 1: Fetch the page
    response = httpx.get(url, headers={"User-Agent": "Mozilla/5.0"}, follow_redirects=True)
    # Fail here instead of paying the model to "extract" data from an error page.
    response.raise_for_status()

    # Step 2: Clean HTML to reduce tokens
    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    # Only the first 8000 characters of the page text are sent.
    clean_text = soup.get_text(separator="\n", strip=True)[:8000]

    # Step 3: Extract with AI
    prompt = (
        "Extract the following from this web page and return as JSON:\n"
        f"{instruction}\n"
        "Web page content:\n"
        f"{clean_text}"
    )
    ai_response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000,
    )
    raw = ai_response.choices[0].message.content
    if raw is None:
        raise ValueError(f"Model returned no text content for {url}")
    return _parse_json_reply(raw)
# Example: pull structured product data from a listing page
result = scrape_with_ai(
    "https://example.com/products",
    "Extract product name, price in USD, rating, and availability for each product",
)
print(json.dumps(result, indent=2))

Batch scraping multiple pages
import asyncio
async def batch_scrape(urls: list[str], instruction: str) -> list[dict]:
    """Run ``scrape_with_ai`` for every URL concurrently in worker threads.

    Results come back in the same order as ``urls``. Because the gather call
    uses ``return_exceptions=True``, a failed scrape appears in the list as
    the exception object instead of being raised.
    """
    pending = []
    for page_url in urls:
        # The scraper is blocking, so each call is pushed onto a thread.
        pending.append(asyncio.to_thread(scrape_with_ai, page_url, instruction))
    return await asyncio.gather(*pending, return_exceptions=True)
# Competitor pricing pages to scrape (ten in the full example; the trailing
# ``...`` is a placeholder for the remaining URLs).
urls = [
    "https://competitor1.com/pricing",
    "https://competitor2.com/pricing",
    ...,
]
results = asyncio.run(batch_scrape(urls, "Extract plan names and prices"))

Cost optimization: use free models for simple scrapes
def smart_scrape(url: str, instruction: str, complexity: str = "simple") -> dict:
    """Pick a model for the scrape based on ``complexity``.

    ``"simple"`` selects ``"qwen3-235b-a22b"``; any other value selects
    ``"claude-sonnet-4-20250514"``.

    NOTE(review): only the model selection is shown here; the rest of the
    body is elided in this snippet, so as written nothing is returned.
    """
    if complexity == "simple":
        model = "qwen3-235b-a22b"
    else:
        model = "claude-sonnet-4-20250514"
# ... same logic, different model

| Complexity | Model | Cost per page |
|---|---|---|
| Simple (prices, titles) | Qwen3 235B (free) | $0 |
| Complex (reviews, comparison) | Claude Sonnet 4 | ~$0.01 |
