【vLLM 学习】API Client
vLLM 是一款专为大语言模型推理加速而设计的框架，实现了 KV 缓存内存几乎零浪费，解决了内存管理瓶颈问题。

更多 vLLM 中文文档及教程可访问 → go.hyper.ai/Wa62f

*在线运行 vLLM 入门教程：零基础分步指南

源码：examples/online_serving/api_client.py

```python
# SPDX-License-Identifier: Apache-2.0
"""示例 Python 客户端，用于 `vllm.entrypoints.api_server`
注意: API 服务器仅用于演示和简单性能基准测试。它不用于生产。
为了生产使用，我们建议使用 `vllm serve` 和 OpenAI 客户端 API。
"""

import argparse
import json
from collections.abc import Iterable

import requests


def clear_line(n: int = 1) -> None:
    LINE_UP = '\033[1A'
    LINE_CLEAR = '\x1b[2K'
    for _ in range(n):
        print(LINE_UP, end=LINE_CLEAR, flush=True)


def post_http_request(prompt: str,
                      api_url: str,
                      n: int = 1,
                      stream: bool = False) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": n,
        "use_beam_search": True,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
    }
    response = requests.post(api_url,
                             headers=headers,
                             json=pload,
                             stream=stream)
    return response


def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=b"\n"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


def get_response(response: requests.Response) -> list[str]:
    data = json.loads(response.content)
    output = data["text"]
    return output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--n", type=int, default=4)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    args = parser.parse_args()
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n
    stream = args.stream

    print(f"Prompt: {prompt!r}\n", flush=True)
    response = post_http_request(prompt, api_url, n, stream)

    if stream:
        num_printed_lines = 0
        for h in get_streaming_response(response):
            clear_line(num_printed_lines)
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
                print(f"Beam candidate {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
            print(f"Beam candidate {i}: {line!r}", flush=True)
```