<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
  <channel>
    <title>Inference models</title>
    <link>https://lambda.ai/inference-models</link>
    <description>Lambda's catalog of model cards for the LLMs that matter. Search by model name to get architecture breakdowns, hardware requirements, deployment guides, and throughput benchmarks on NVIDIA GPUs.</description>
    <language>en</language>
    <pubDate>Sat, 06 Jun 2026 15:42:26 GMT</pubDate>
    <dc:date>2026-06-06T15:42:26Z</dc:date>
    <dc:language>en</dc:language>
    <item>
      <title>nvidia/nemotron-3-ultra</title>
      <link>https://lambda.ai/inference-models/nvidia/nemotron-3-ultra</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/nvidia/nemotron-3-ultra" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-nemotron-3-ultra-on-lambda-1780760405077.png" alt="How to deploy Nemotron 3 Ultra on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;p&gt;All benchmarks use the single NVFP4 checkpoint (&lt;code&gt;nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4&lt;/code&gt;) on a decode-heavy 8K-input / 64K-output workload at 256 concurrent requests. &lt;strong&gt;Per-user generation throughput&lt;/strong&gt; is aggregate generation tok/s divided by the concurrency level — the rate at which each individual user sees their response stream back.&lt;/p&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/nvidia/nemotron-3-ultra" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-nemotron-3-ultra-on-lambda-1780760405077.png" alt="How to deploy Nemotron 3 Ultra on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;p&gt;All benchmarks use the single NVFP4 checkpoint (&lt;code&gt;nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4&lt;/code&gt;) on a decode-heavy 8K-input / 64K-output workload at 256 concurrent requests. &lt;strong&gt;Per-user generation throughput&lt;/strong&gt; is aggregate generation tok/s divided by the concurrency level — the rate at which each individual user sees their response stream back.&lt;/p&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fnvidia%2Fnemotron-3-ultra&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Sat, 06 Jun 2026 15:42:26 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/nvidia/nemotron-3-ultra</guid>
      <dc:date>2026-06-06T15:42:26Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>LiquidAI/LFM2.5-8B-A1B</title>
      <link>https://lambda.ai/inference-models/liquidai/lfm2.5-8b-a1b</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/liquidai/lfm2.5-8b-a1b" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-lfm2-5-8b-a1b-on-lambda-1780497459417.png" alt="How to deploy LFM2.5-8B-A1B on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA B200 GPU&lt;/td&gt; 
      &lt;td&gt;6,098 tok/s&lt;/td&gt; 
      &lt;td&gt;206 tok/s&lt;/td&gt; 
      &lt;td&gt;30,489 tok/s&lt;/td&gt; 
      &lt;td&gt;792 ms&lt;/td&gt; 
      &lt;td&gt;4.9 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA H100 GPU&lt;/td&gt; 
      &lt;td&gt;3,714 tok/s&lt;/td&gt; 
      &lt;td&gt;125 tok/s&lt;/td&gt; 
      &lt;td&gt;18,572 tok/s&lt;/td&gt; 
      &lt;td&gt;1,248 ms&lt;/td&gt; 
      &lt;td&gt;8.0 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA A100 GPU&lt;/td&gt; 
      &lt;td&gt;1,950 tok/s&lt;/td&gt; 
      &lt;td&gt;68 tok/s&lt;/td&gt; 
      &lt;td&gt;9,751 tok/s&lt;/td&gt; 
      &lt;td&gt;3,594 ms&lt;/td&gt; 
      &lt;td&gt;14.7 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA B200 GPU&lt;/td&gt; 
      &lt;td&gt;7,253 tok/s&lt;/td&gt; 
      &lt;td&gt;238 tok/s&lt;/td&gt; 
      &lt;td&gt;36,267 tok/s&lt;/td&gt; 
      &lt;td&gt;433 ms&lt;/td&gt; 
      &lt;td&gt;4.5 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA H100 GPU&lt;/td&gt; 
      &lt;td&gt;3,787 tok/s&lt;/td&gt; 
      &lt;td&gt;123 tok/s&lt;/td&gt; 
      &lt;td&gt;18,937 tok/s&lt;/td&gt; 
      &lt;td&gt;568 ms&lt;/td&gt; 
      &lt;td&gt;8.2 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA A100 GPU&lt;/td&gt; 
      &lt;td&gt;1,971 tok/s&lt;/td&gt; 
      &lt;td&gt;64 tok/s&lt;/td&gt; 
      &lt;td&gt;9,853 tok/s&lt;/td&gt; 
      &lt;td&gt;962 ms&lt;/td&gt; 
      &lt;td&gt;15.7 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/liquidai/lfm2.5-8b-a1b" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-lfm2-5-8b-a1b-on-lambda-1780497459417.png" alt="How to deploy LFM2.5-8B-A1B on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA B200 GPU&lt;/td&gt; 
      &lt;td&gt;6,098 tok/s&lt;/td&gt; 
      &lt;td&gt;206 tok/s&lt;/td&gt; 
      &lt;td&gt;30,489 tok/s&lt;/td&gt; 
      &lt;td&gt;792 ms&lt;/td&gt; 
      &lt;td&gt;4.9 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA H100 GPU&lt;/td&gt; 
      &lt;td&gt;3,714 tok/s&lt;/td&gt; 
      &lt;td&gt;125 tok/s&lt;/td&gt; 
      &lt;td&gt;18,572 tok/s&lt;/td&gt; 
      &lt;td&gt;1,248 ms&lt;/td&gt; 
      &lt;td&gt;8.0 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA A100 GPU&lt;/td&gt; 
      &lt;td&gt;1,950 tok/s&lt;/td&gt; 
      &lt;td&gt;68 tok/s&lt;/td&gt; 
      &lt;td&gt;9,751 tok/s&lt;/td&gt; 
      &lt;td&gt;3,594 ms&lt;/td&gt; 
      &lt;td&gt;14.7 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA B200 GPU&lt;/td&gt; 
      &lt;td&gt;7,253 tok/s&lt;/td&gt; 
      &lt;td&gt;238 tok/s&lt;/td&gt; 
      &lt;td&gt;36,267 tok/s&lt;/td&gt; 
      &lt;td&gt;433 ms&lt;/td&gt; 
      &lt;td&gt;4.5 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA H100 GPU&lt;/td&gt; 
      &lt;td&gt;3,787 tok/s&lt;/td&gt; 
      &lt;td&gt;123 tok/s&lt;/td&gt; 
      &lt;td&gt;18,937 tok/s&lt;/td&gt; 
      &lt;td&gt;568 ms&lt;/td&gt; 
      &lt;td&gt;8.2 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA A100 GPU&lt;/td&gt; 
      &lt;td&gt;1,971 tok/s&lt;/td&gt; 
      &lt;td&gt;64 tok/s&lt;/td&gt; 
      &lt;td&gt;9,853 tok/s&lt;/td&gt; 
      &lt;td&gt;962 ms&lt;/td&gt; 
      &lt;td&gt;15.7 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fliquidai%2Flfm2.5-8b-a1b&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Wed, 03 Jun 2026 14:44:31 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/liquidai/lfm2.5-8b-a1b</guid>
      <dc:date>2026-06-03T14:44:31Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>HuggingFaceBio/Carbon-3B</title>
      <link>https://lambda.ai/inference-models/huggingfacebio/carbon-3b</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/huggingfacebio/carbon-3b" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-carbon-on-lambda-1780497456154.png" alt="How to deploy Carbon on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: nucleotide throughput&lt;/h2&gt; 
&lt;p&gt;Both Carbon-500M and Carbon-3B run on a single &lt;strong&gt;NVIDIA A10 GPU&lt;/strong&gt;, served with SGLang or vLLM. Because Carbon tokenizes DNA as non-overlapping 6-mers, each token carries roughly 6 base pairs, so the token rates below correspond to roughly 6× as many base pairs per second.&lt;/p&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/huggingfacebio/carbon-3b" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-carbon-on-lambda-1780497456154.png" alt="How to deploy Carbon on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: nucleotide throughput&lt;/h2&gt; 
&lt;p&gt;Both Carbon-500M and Carbon-3B run on a single &lt;strong&gt;NVIDIA A10 GPU&lt;/strong&gt;, served with SGLang or vLLM. Because Carbon tokenizes DNA as non-overlapping 6-mers, each token carries roughly 6 base pairs, so the token rates below correspond to roughly 6× as many base pairs per second.&lt;/p&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fhuggingfacebio%2Fcarbon-3b&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Wed, 03 Jun 2026 14:44:28 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/huggingfacebio/carbon-3b</guid>
      <dc:date>2026-06-03T14:44:28Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>stepfun-ai/Step-3.7-Flash</title>
      <link>https://lambda.ai/inference-models/stepfun-ai/step-3.7-flash</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/stepfun-ai/step-3.7-flash" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-step-3-7-flash-on-lambda-1780497452159.png" alt="How to deploy Step 3.7 Flash on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;p&gt;Measured with vLLM and MTP speculative decoding on a workload of 8,192 input / 2,048 output tokens at 32 concurrent requests.&lt;/p&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/stepfun-ai/step-3.7-flash" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-step-3-7-flash-on-lambda-1780497452159.png" alt="How to deploy Step 3.7 Flash on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;p&gt;Measured with vLLM and MTP speculative decoding on a workload of 8,192 input / 2,048 output tokens at 32 concurrent requests.&lt;/p&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fstepfun-ai%2Fstep-3.7-flash&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Wed, 03 Jun 2026 14:44:24 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/stepfun-ai/step-3.7-flash</guid>
      <dc:date>2026-06-03T14:44:24Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>moonshotai/Kimi-K2.6</title>
      <link>https://lambda.ai/inference-models/moonshotai/kimi-k2.6</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/moonshotai/kimi-k2.6" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-kimi-k2-6-on-lambda-1777557525448.png" alt="How to deploy Kimi-K2.6 on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;h3&gt;vLLM&lt;/h3&gt; 
&lt;div class="table-wrapper"&gt; 
 &lt;table&gt; 
  &lt;thead&gt; 
   &lt;tr&gt; 
    &lt;th&gt;Hardware&lt;/th&gt; 
    &lt;th&gt;Gen. throughput&lt;/th&gt; 
    &lt;th&gt;Per-user gen&lt;/th&gt; 
    &lt;th&gt;Total throughput&lt;/th&gt; 
    &lt;th&gt;TTFT (mean)&lt;/th&gt; 
    &lt;th&gt;ITL (mean)&lt;/th&gt; 
   &lt;/tr&gt; 
  &lt;/thead&gt; 
  &lt;tbody&gt; 
   &lt;tr&gt; 
    &lt;td&gt;NVIDIA HGX B200&lt;/td&gt; 
    &lt;td&gt;1,408 tok/s&lt;/td&gt; 
    &lt;td&gt;44 tok/s&lt;/td&gt; 
    &lt;td&gt;7,046 tok/s&lt;/td&gt; 
    &lt;td&gt;2,264 ms&lt;/td&gt; 
    &lt;td&gt;44.5 ms&lt;/td&gt; 
   &lt;/tr&gt; 
  &lt;/tbody&gt; 
 &lt;/table&gt; 
&lt;/div&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/moonshotai/kimi-k2.6" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-kimi-k2-6-on-lambda-1777557525448.png" alt="How to deploy Kimi-K2.6 on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;h3&gt;vLLM&lt;/h3&gt; 
&lt;div class="table-wrapper"&gt; 
 &lt;table&gt; 
  &lt;thead&gt; 
   &lt;tr&gt; 
    &lt;th&gt;Hardware&lt;/th&gt; 
    &lt;th&gt;Gen. throughput&lt;/th&gt; 
    &lt;th&gt;Per-user gen&lt;/th&gt; 
    &lt;th&gt;Total throughput&lt;/th&gt; 
    &lt;th&gt;TTFT (mean)&lt;/th&gt; 
    &lt;th&gt;ITL (mean)&lt;/th&gt; 
   &lt;/tr&gt; 
  &lt;/thead&gt; 
  &lt;tbody&gt; 
   &lt;tr&gt; 
    &lt;td&gt;NVIDIA HGX B200&lt;/td&gt; 
    &lt;td&gt;1,408 tok/s&lt;/td&gt; 
    &lt;td&gt;44 tok/s&lt;/td&gt; 
    &lt;td&gt;7,046 tok/s&lt;/td&gt; 
    &lt;td&gt;2,264 ms&lt;/td&gt; 
    &lt;td&gt;44.5 ms&lt;/td&gt; 
   &lt;/tr&gt; 
  &lt;/tbody&gt; 
 &lt;/table&gt; 
&lt;/div&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fmoonshotai%2Fkimi-k2.6&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Thu, 30 Apr 2026 14:02:36 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/moonshotai/kimi-k2.6</guid>
      <dc:date>2026-04-30T14:02:36Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>deepseek-ai/DeepSeek-V4-Pro</title>
      <link>https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-pro</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-pro" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-deepseek-v4-pro-on-lambda-1777394557120.png" alt="How to deploy DeepSeek-V4-Pro on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;h3&gt;vLLM&lt;/h3&gt; 
&lt;div class="table-wrapper"&gt; 
 &lt;table&gt; 
  &lt;tbody&gt; 
   &lt;tr&gt; 
    &lt;td&gt;Hardware&lt;/td&gt; 
    &lt;td&gt;Gen. throughput&lt;/td&gt; 
    &lt;td&gt;Per-user gen&lt;/td&gt; 
    &lt;td&gt;Total throughput&lt;/td&gt; 
    &lt;td&gt;TTFT (mean)&lt;/td&gt; 
    &lt;td&gt;ITL (mean)&lt;/td&gt; 
   &lt;/tr&gt; 
   &lt;tr&gt; 
    &lt;td&gt;NVIDIA HGX B200&lt;/td&gt; 
    &lt;td&gt;911.92 tok/s&lt;/td&gt; 
    &lt;td&gt;28.50 tok/s&lt;/td&gt; 
    &lt;td&gt;4,561.38 tok/s&lt;/td&gt; 
    &lt;td&gt;1,186.15 ms&lt;/td&gt; 
    &lt;td&gt;55.79 ms&lt;/td&gt; 
   &lt;/tr&gt; 
  &lt;/tbody&gt; 
 &lt;/table&gt; 
&lt;/div&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-pro" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-deepseek-v4-pro-on-lambda-1777394557120.png" alt="How to deploy DeepSeek-V4-Pro on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;h3&gt;vLLM&lt;/h3&gt; 
&lt;div class="table-wrapper"&gt; 
 &lt;table&gt; 
  &lt;tbody&gt; 
   &lt;tr&gt; 
    &lt;td&gt;Hardware&lt;/td&gt; 
    &lt;td&gt;Gen. throughput&lt;/td&gt; 
    &lt;td&gt;Per-user gen&lt;/td&gt; 
    &lt;td&gt;Total throughput&lt;/td&gt; 
    &lt;td&gt;TTFT (mean)&lt;/td&gt; 
    &lt;td&gt;ITL (mean)&lt;/td&gt; 
   &lt;/tr&gt; 
   &lt;tr&gt; 
    &lt;td&gt;NVIDIA HGX B200&lt;/td&gt; 
    &lt;td&gt;911.92 tok/s&lt;/td&gt; 
    &lt;td&gt;28.50 tok/s&lt;/td&gt; 
    &lt;td&gt;4,561.38 tok/s&lt;/td&gt; 
    &lt;td&gt;1,186.15 ms&lt;/td&gt; 
    &lt;td&gt;55.79 ms&lt;/td&gt; 
   &lt;/tr&gt; 
  &lt;/tbody&gt; 
 &lt;/table&gt; 
&lt;/div&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fdeepseek-ai%2Fdeepseek-v4-pro&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Tue, 28 Apr 2026 16:51:18 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-pro</guid>
      <dc:date>2026-04-28T16:51:18Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>deepseek-ai/DeepSeek-V4-Flash</title>
      <link>https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-flash</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-flash" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-deepseek-v4-flash-on-lambda-1777322325030.png" alt="How to deploy DeepSeek-V4-Flash on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX B200 (native FP4+FP8 build)&lt;/td&gt; 
      &lt;td&gt;1,222 tok/s&lt;/td&gt; 
      &lt;td&gt;38 tok/s&lt;/td&gt; 
      &lt;td&gt;11,000 tok/s&lt;/td&gt; 
      &lt;td&gt;1,701 ms&lt;/td&gt; 
      &lt;td&gt;66 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX H100 (FP8-quantized build)&lt;/td&gt; 
      &lt;td&gt;1,262 tok/s&lt;/td&gt; 
      &lt;td&gt;39 tok/s&lt;/td&gt; 
      &lt;td&gt;11,361 tok/s&lt;/td&gt; 
      &lt;td&gt;2,463 ms&lt;/td&gt; 
      &lt;td&gt;60 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX B200 (native FP4+FP8 build)&lt;/td&gt; 
      &lt;td&gt;1,469 tok/s&lt;/td&gt; 
      &lt;td&gt;46 tok/s&lt;/td&gt; 
      &lt;td&gt;13,217 tok/s&lt;/td&gt; 
      &lt;td&gt;1,452 ms&lt;/td&gt; 
      &lt;td&gt;20 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-flash" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-deepseek-v4-flash-on-lambda-1777322325030.png" alt="How to deploy DeepSeek-V4-Flash on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX B200 (native FP4+FP8 build)&lt;/td&gt; 
      &lt;td&gt;1,222 tok/s&lt;/td&gt; 
      &lt;td&gt;38 tok/s&lt;/td&gt; 
      &lt;td&gt;11,000 tok/s&lt;/td&gt; 
      &lt;td&gt;1,701 ms&lt;/td&gt; 
      &lt;td&gt;66 ms&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX H100 (FP8-quantized build)&lt;/td&gt; 
      &lt;td&gt;1,262 tok/s&lt;/td&gt; 
      &lt;td&gt;39 tok/s&lt;/td&gt; 
      &lt;td&gt;11,361 tok/s&lt;/td&gt; 
      &lt;td&gt;2,463 ms&lt;/td&gt; 
      &lt;td&gt;60 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT (mean)&lt;/td&gt; 
      &lt;td&gt;ITL (mean)&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;NVIDIA HGX B200 (native FP4+FP8 build)&lt;/td&gt; 
      &lt;td&gt;1,469 tok/s&lt;/td&gt; 
      &lt;td&gt;46 tok/s&lt;/td&gt; 
      &lt;td&gt;13,217 tok/s&lt;/td&gt; 
      &lt;td&gt;1,452 ms&lt;/td&gt; 
      &lt;td&gt;20 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fdeepseek-ai%2Fdeepseek-v4-flash&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Mon, 27 Apr 2026 21:16:57 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/deepseek-ai/deepseek-v4-flash</guid>
      <dc:date>2026-04-27T21:16:57Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <item>
      <title>zai-org/GLM-5.1</title>
      <link>https://lambda.ai/inference-models/zai-org/glm-5.1</link>
      <description>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/zai-org/glm-5.1" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-glm-5-1-on-lambda-1775573401310.png" alt="How to deploy GLM-5.1 on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen.&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT&lt;/td&gt; 
      &lt;td&gt;ITL&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA HGX B200&lt;/td&gt; 
      &lt;td&gt;1,345 tok/s&lt;/td&gt; 
      &lt;td&gt;42.0 tok/s/user&lt;/td&gt; 
      &lt;td&gt;6,727 tok/s&lt;/td&gt; 
      &lt;td&gt;1,073 ms&lt;/td&gt; 
      &lt;td&gt;59 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen.&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT&lt;/td&gt; 
      &lt;td&gt;ITL&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA HGX B200&lt;/td&gt; 
      &lt;td&gt;1,265 tok/s&lt;/td&gt; 
      &lt;td&gt;39.5 tok/s/user&lt;/td&gt; 
      &lt;td&gt;6,327 tok/s&lt;/td&gt; 
      &lt;td&gt;1,317 ms&lt;/td&gt; 
      &lt;td&gt;58 ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;</description>
      <content:encoded>&lt;div class="hs-featured-image-wrapper"&gt; 
 &lt;a href="https://lambda.ai/inference-models/zai-org/glm-5.1" title="" class="hs-featured-image-link"&gt; &lt;img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-glm-5-1-on-lambda-1775573401310.png" alt="How to deploy GLM-5.1 on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"&gt; &lt;/a&gt; 
&lt;/div&gt; 
&lt;h2&gt;TL;DR: token throughput&lt;/h2&gt; 
&lt;div class="tab-container"&gt; 
 &lt;div class="tab-buttons"&gt; SGLang vLLM 
 &lt;/div&gt; 
 &lt;div class="tab-content active"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen.&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT&lt;/td&gt; 
      &lt;td&gt;ITL&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA HGX B200&lt;/td&gt; 
      &lt;td&gt;1,345 tok/s&lt;/td&gt; 
      &lt;td&gt;42.0 tok/s/user&lt;/td&gt; 
      &lt;td&gt;6,727 tok/s&lt;/td&gt; 
      &lt;td&gt;1,073ms&lt;/td&gt; 
      &lt;td&gt;59ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
 &lt;div class="tab-content"&gt; 
  &lt;div class="table-wrapper"&gt; 
   &lt;table&gt; 
    &lt;tbody&gt; 
     &lt;tr&gt; 
      &lt;td&gt;Hardware&lt;/td&gt; 
      &lt;td&gt;Gen. throughput&lt;/td&gt; 
      &lt;td&gt;Per-user gen.&lt;/td&gt; 
      &lt;td&gt;Total throughput&lt;/td&gt; 
      &lt;td&gt;TTFT&lt;/td&gt; 
      &lt;td&gt;ITL&lt;/td&gt; 
     &lt;/tr&gt; 
     &lt;tr&gt; 
      &lt;td&gt;1× NVIDIA HGX B200&lt;/td&gt; 
      &lt;td&gt;1,265 tok/s&lt;/td&gt; 
      &lt;td&gt;39.5 tok/s/user&lt;/td&gt; 
      &lt;td&gt;6,327 tok/s&lt;/td&gt; 
      &lt;td&gt;1,317ms&lt;/td&gt; 
      &lt;td&gt;58ms&lt;/td&gt; 
     &lt;/tr&gt; 
    &lt;/tbody&gt; 
   &lt;/table&gt; 
  &lt;/div&gt; 
 &lt;/div&gt; 
&lt;/div&gt;  
&lt;img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;amp;k=14&amp;amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fzai-org%2Fglm-5.1&amp;amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; "&gt;</content:encoded>
      <pubDate>Tue, 07 Apr 2026 16:15:31 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/zai-org/glm-5.1</guid>
      <dc:date>2026-04-07T16:15:31Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <!-- Feed entry for the Nemotron 3 Super 120B-A12B model card.
         <description> and <content:encoded> carry the same HTML (a vLLM throughput table covering five
         hardware configurations); <content:encoded> additionally ends with a 1x1 HubSpot tracking image.
         The HTML is wrapped in CDATA sections, so it appears literally instead of entity-escaped. -->
    <item>
      <title>nvidia/NVIDIA-Nemotron-3-Super-120B-A12B</title>
      <link>https://lambda.ai/inference-models/nvidia/nvidia-nemotron-3-super-120b-a12b</link>
      <description><![CDATA[<div class="hs-featured-image-wrapper"> 
 <a href="https://lambda.ai/inference-models/nvidia/nvidia-nemotron-3-super-120b-a12b" title="" class="hs-featured-image-link"> <img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-nemotron-3-super-on-lambda-1773333896548.png" alt="How to deploy Nemotron 3 Super on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"> </a> 
</div> 
<h2>TL;DR: token throughput</h2> 
<h3>vLLM</h3> 
<div class="table-wrapper"> 
 <table> 
  <thead> 
   <tr> 
    <th>Hardware</th> 
    <th>Gen. throughput</th> 
    <th>TTFT</th> 
    <th>ITL</th> 
   </tr> 
  </thead> 
  <tbody> 
   <tr> 
    <td>2× NVIDIA B200 GPUs (NVFP4)</td> 
    <td>2,057 tok/s</td> 
    <td>4,040ms</td> 
    <td>12ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA B200 GPU (NVFP4)</td> 
    <td>1,517 tok/s</td> 
    <td>4,455ms</td> 
    <td>16ms</td> 
   </tr> 
   <tr> 
    <td>2× NVIDIA B200 GPUs (FP8)</td> 
    <td>1,847 tok/s</td> 
    <td>3,948ms</td> 
    <td>13ms</td> 
   </tr> 
   <tr> 
    <td>2× NVIDIA H100 GPUs (FP8)</td> 
    <td>1,116 tok/s</td> 
    <td>4,557ms</td> 
    <td>24ms</td> 
   </tr> 
   <tr> 
    <td>4× NVIDIA A100 GPUs (BF16)</td> 
    <td>553 tok/s</td> 
    <td>6,694ms</td> 
    <td>51ms</td> 
   </tr> 
  </tbody> 
 </table> 
</div>]]></description>
      <content:encoded><![CDATA[<div class="hs-featured-image-wrapper"> 
 <a href="https://lambda.ai/inference-models/nvidia/nvidia-nemotron-3-super-120b-a12b" title="" class="hs-featured-image-link"> <img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-nemotron-3-super-on-lambda-1773333896548.png" alt="How to deploy Nemotron 3 Super on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"> </a> 
</div> 
<h2>TL;DR: token throughput</h2> 
<h3>vLLM</h3> 
<div class="table-wrapper"> 
 <table> 
  <thead> 
   <tr> 
    <th>Hardware</th> 
    <th>Gen. throughput</th> 
    <th>TTFT</th> 
    <th>ITL</th> 
   </tr> 
  </thead> 
  <tbody> 
   <tr> 
    <td>2× NVIDIA B200 GPUs (NVFP4)</td> 
    <td>2,057 tok/s</td> 
    <td>4,040ms</td> 
    <td>12ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA B200 GPU (NVFP4)</td> 
    <td>1,517 tok/s</td> 
    <td>4,455ms</td> 
    <td>16ms</td> 
   </tr> 
   <tr> 
    <td>2× NVIDIA B200 GPUs (FP8)</td> 
    <td>1,847 tok/s</td> 
    <td>3,948ms</td> 
    <td>13ms</td> 
   </tr> 
   <tr> 
    <td>2× NVIDIA H100 GPUs (FP8)</td> 
    <td>1,116 tok/s</td> 
    <td>4,557ms</td> 
    <td>24ms</td> 
   </tr> 
   <tr> 
    <td>4× NVIDIA A100 GPUs (BF16)</td> 
    <td>553 tok/s</td> 
    <td>6,694ms</td> 
    <td>51ms</td> 
   </tr> 
  </tbody> 
 </table> 
</div>  
<img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;k=14&amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fnvidia%2Fnvidia-nemotron-3-super-120b-a12b&amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; ">]]></content:encoded>
      <pubDate>Thu, 12 Mar 2026 16:48:32 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/nvidia/nvidia-nemotron-3-super-120b-a12b</guid>
      <dc:date>2026-03-12T16:48:32Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
    <!-- Feed entry for the OLMo Hybrid Instruct DPO 7B model card.
         Both <description> and <content:encoded> hold the same vLLM throughput table (B200, H100 and
         A100, one GPU each) as CDATA-wrapped HTML; only <content:encoded> appends the 1x1 HubSpot
         tracking image. -->
    <item>
      <title>allenai/Olmo-Hybrid-Instruct-DPO-7B</title>
      <link>https://lambda.ai/inference-models/allenai/olmo-hybrid-instruct-dpo-7b</link>
      <description><![CDATA[<div class="hs-featured-image-wrapper"> 
 <a href="https://lambda.ai/inference-models/allenai/olmo-hybrid-instruct-dpo-7b" title="" class="hs-featured-image-link"> <img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-olmo-hybrid-7b-on-lambda-1772833629145.png" alt="How to deploy OLMo Hybrid 7B on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"> </a> 
</div> 
<h2>TL;DR: token throughput on vLLM</h2> 
<div class="table-wrapper"> 
 <table> 
  <thead> 
   <tr> 
    <th>Hardware</th> 
    <th>Gen. throughput</th> 
    <th>TTFT</th> 
    <th>ITL</th> 
   </tr> 
  </thead> 
  <tbody> 
   <tr> 
    <td>1× NVIDIA B200 GPU</td> 
    <td>1,765 tok/s</td> 
    <td>4,424ms</td> 
    <td>14ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA H100 GPU</td> 
    <td>1,066 tok/s</td> 
    <td>4,665ms</td> 
    <td>25ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA A100 GPU</td> 
    <td>551 tok/s</td> 
    <td>7,191ms</td> 
    <td>51ms</td> 
   </tr> 
  </tbody> 
 </table> 
</div>]]></description>
      <content:encoded><![CDATA[<div class="hs-featured-image-wrapper"> 
 <a href="https://lambda.ai/inference-models/allenai/olmo-hybrid-instruct-dpo-7b" title="" class="hs-featured-image-link"> <img src="https://lambda.ai/hubfs/web-static/images/llm-pages/llm-how-to-deploy-olmo-hybrid-7b-on-lambda-1772833629145.png" alt="How to deploy OLMo Hybrid 7B on Lambda featured image" class="hs-featured-image" style="width:auto !important; max-width:50%; float:left; margin:0 15px 15px 0;"> </a> 
</div> 
<h2>TL;DR: token throughput on vLLM</h2> 
<div class="table-wrapper"> 
 <table> 
  <thead> 
   <tr> 
    <th>Hardware</th> 
    <th>Gen. throughput</th> 
    <th>TTFT</th> 
    <th>ITL</th> 
   </tr> 
  </thead> 
  <tbody> 
   <tr> 
    <td>1× NVIDIA B200 GPU</td> 
    <td>1,765 tok/s</td> 
    <td>4,424ms</td> 
    <td>14ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA H100 GPU</td> 
    <td>1,066 tok/s</td> 
    <td>4,665ms</td> 
    <td>25ms</td> 
   </tr> 
   <tr> 
    <td>1× NVIDIA A100 GPU</td> 
    <td>551 tok/s</td> 
    <td>7,191ms</td> 
    <td>51ms</td> 
   </tr> 
  </tbody> 
 </table> 
</div>  
<img src="https://track.hubspot.com/__ptq.gif?a=21998649&amp;k=14&amp;r=https%3A%2F%2Flambda.ai%2Finference-models%2Fallenai%2Folmo-hybrid-instruct-dpo-7b&amp;bu=https%253A%252F%252Flambda.ai%252Finference-models&amp;bvt=rss" alt="" width="1" height="1" style="min-height:1px!important;width:1px!important;border-width:0!important;margin-top:0!important;margin-bottom:0!important;margin-right:0!important;margin-left:0!important;padding-top:0!important;padding-bottom:0!important;padding-right:0!important;padding-left:0!important; ">]]></content:encoded>
      <pubDate>Fri, 06 Mar 2026 21:49:02 GMT</pubDate>
      <guid>https://lambda.ai/inference-models/allenai/olmo-hybrid-instruct-dpo-7b</guid>
      <dc:date>2026-03-06T21:49:02Z</dc:date>
      <dc:creator>Lambda</dc:creator>
    </item>
  </channel>
</rss>
