The Web Crawler for AI Agents and LLMs

Get web data for any AI project, from agentic workflows and RAG systems to data analysis. Spider offers the speed and scalability required for any project size.

100,000+

pages/sec

99.5%

success rate

Pay per use

no minimums

Try for free

No credit card required

spider.cloud

import requests, os

# Send a scrape request for one URL and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/scrape'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Ask for the content in markdown format.
json_data = dict(url="https://spider.cloud", return_format="markdown")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a scrape request for one URL and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/scrape';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Ask for the content in markdown format.
const payload = { url: "https://spider.cloud", return_format: "markdown" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a crawl request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/crawl'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the start URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a crawl request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/crawl';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the start URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a screenshot request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/screenshot'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the target URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)

# returns base64 encoded image
print(response.json())

// Send a screenshot request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/screenshot';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the target URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

// returns base64 encoded image
const result = await response.json();
console.log(result);

import requests, os

# Send a links request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/links'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the start URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a links request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/links';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the start URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a search request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/search'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the search query string.
json_data = dict(search="top AI crawling tools")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a search request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/search';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: the search query string.
const payload = { search: "top AI crawling tools" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a scrape request for one URL and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/scrape'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Ask for the content in markdown format.
json_data = dict(url="https://spider.cloud", return_format="markdown")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a scrape request for one URL and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/scrape';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Ask for the content in markdown format.
const payload = { url: "https://spider.cloud", return_format: "markdown" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a crawl request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/crawl'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the start URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a crawl request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/crawl';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the start URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a screenshot request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/screenshot'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the target URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)

# returns base64 encoded image
print(response.json())

// Send a screenshot request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/screenshot';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the target URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

// returns base64 encoded image
const result = await response.json();
console.log(result);

import requests, os

# Send a links request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/links'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the start URL with "limit" set to 5.
json_data = dict(limit=5, url="https://spider.cloud")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a links request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/links';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: "limit" set to 5 plus the start URL.
const payload = { limit: 5, url: "https://spider.cloud" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

import requests, os

# Send a search request and print the parsed JSON response.
# The API key is read from the SPIDER_API_KEY environment variable.
api_key = os.getenv("SPIDER_API_KEY")
endpoint = 'https://api.spider.cloud/search'

headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

# Request body: the search query string.
json_data = dict(search="top AI crawling tools")

response = requests.post(endpoint, headers=headers, json=json_data)
print(response.json())

// Send a search request and log the parsed JSON response.
// The API key is read from the SPIDER_API_KEY environment variable.
const endpoint = 'https://api.spider.cloud/search';
const headers = {
  'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
  'Content-Type': 'application/json'
};
// Request body: the search query string.
const payload = { search: "top AI crawling tools" };

const response = await fetch(endpoint, {
  method: 'POST',
  headers,
  body: JSON.stringify(payload)
});

const result = await response.json();
console.log(result);

Integrations with leading AI platforms

Powering AI at Web Scale

The fastest, most cost-effective web data infrastructure for the next generation of AI.

Pay Per Use

Billed to the fraction of a cent. No minimums, no subscriptions. Scale from 1 to 1 million pages seamlessly.

Unmatched Speed

Rust-powered concurrency crawls 20x faster than alternatives. Streaming results eliminate wait times.

Built-in Reliability

Auto proxy rotation, anti-bot handling, and headless browser rendering. Focus on building, not scraping.

Spider API Request Modes · Benchmarked tailwindcss.com · 06/16/2024

See framework benchmarks

Raw Speed

Sub-second responses on single pages, even with full browser rendering. High-quality output without the wait.

POST /scrape { "url": "https://example.com", "return_format": "markdown" }

▶ Rendering with headless browser...

▶ Content extracted: 4.2kb markdown

▶ Status: 200

> completed in 0.42s

Concurrent Streaming

Stream results concurrently without bandwidth limits. The more sites you crawl, the bigger your latency savings.

POST /crawl { "url": "https://spider.cloud", "limit": 100 }

▶ Connected — streaming 100 pages...

▶ Throughput: 850 pages/sec

▶ Latency: 12ms avg

> completed in 1.47s

Intelligent Infrastructure

Built on the Spider open-source project. The stack learns over time, optimizing country-level geo routing and filtering out resources that slow things down. Everything stays fast, cheap, and efficient.

SYS.INTELLIGENCE

Geo routing: optimized per target

Resource filtering: adaptive learning

Anti-bot bypass: continuously updated

Network stack: auto-tuned per domain

Success rate: 99.5%

First-class Integrations

Works with LangChain, LlamaIndex, CrewAI, AutoGen, Agno, FlowiseAI, Dify, and more. Drop Spider into any AI stack in minutes.

# Python — LangChain

# Import the Spider loader from LangChain's community document loaders.
from langchain_community.document_loaders import SpiderLoader

# Create a loader for the target URL.
loader = SpiderLoader(url="https://example.com")

# NOTE(review): load() presumably returns the loaded documents — confirm against the LangChain docs.
docs = loader.load()

# Also: LlamaIndex, CrewAI, AutoGen, Agno...

AI Data Extraction

Send a prompt describing what you need and get structured JSON back. No CSS selectors, no XPath, no parsing code.

POST /ai/crawl

{

"url": "https://books.toscrape.com",

"prompt": "Extract all book titles and prices"

}

▶ Crawling... done

▶ AI extracting with prompt...

[{ "title": "A Light in the Attic", "price": "£51.77" },

{ "title": "Tipping the Velvet", "price": "£53.74" }, ...]

Start Collecting Data Today

Our web crawler provides fully elastic concurrency scaling, optimal output formats, and AI scraping.

Performance Tuned

Spider is written in Rust and runs with full concurrency to crawl thousands of pages in seconds.

Multiple Response Formats

Get clean formatted markdown, HTML, and text content for fine-tuning or training AI models.

HTTP Caching

Further boost speed by caching repeated web page crawls to minimize expenses while building.

Smart Mode

Dynamically switch to Chrome to render JavaScript when needed.

Search

Perform stable and accurate SERP requests with a single API.

The Crawler for LLMs

Don't let crawling and scraping be the highest latency in your LLM & AI agent stack.

Collect data easily

Auto proxy rotations
Low latency responses
99.5% average success rate
Headless browsers
Markdown responses

The Fastest Web Crawler

Powered by spider-rs
100,000 pages/second
Unlimited concurrency
Simple consistent API
50,000 requests per minute

Do more with AI

Browser scripting
Advanced data extraction
Streamlined data pipelines
Ideal for LLMs and AI Agents
Precise content labeling

Join the Community

Backed by a network of early advocates, contributors, and supporters.

GitHub discussions Discord chat

@iammerrick

Rust based crawler Spider is next level for crawling & scraping sites. So fast. Their cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider

@WilliamEspegren

Web crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that's wayyyy cheaper than any competitor Name a reason for me to use anything else? github.com/spider-rs/spid…

@gasa

@spider_rs is the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. Spider does it in an instant

@Ashpreet Bedi

@spider_rs is THE best crawler out there, give it a try

@Troyusrex

I found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.

@Dify.AI

🕷️ Spider @spider_rust can be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.

Get AI-ready data with zero friction

Start crawling in under 30 seconds. New accounts can try it out with no credit card required.

Try for free

Read the docs

Frequently Asked Questions

Everything you need to know about Spider.

What is Spider?

Spider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.

How can I try Spider?

Purchase credits for our cloud system or test the Open-Source Spider engine to explore its capabilities.

What are the rate limits?

Every account can make up to 50,000 core API requests per second.

Can you crawl all pages?

Yes, Spider accurately and ethically crawls all necessary content without needing a sitemap. We rate-limit individual URLs per minute to balance the load on a web server.

What formats can Spider convert web data into?

Spider outputs HTML, raw, text, and various markdown formats. It supports JSON, JSONL, CSV, and XML for API responses.

Does it respect robots.txt?

Yes, compliance with robots.txt is the default, but you can disable this if necessary.