Instructions to use 1bitLLM/bitnet_b1_58-large with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use 1bitLLM/bitnet_b1_58-large with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="1bitLLM/bitnet_b1_58-large")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("1bitLLM/bitnet_b1_58-large")
model = AutoModelForCausalLM.from_pretrained("1bitLLM/bitnet_b1_58-large")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use 1bitLLM/bitnet_b1_58-large with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "1bitLLM/bitnet_b1_58-large"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "1bitLLM/bitnet_b1_58-large",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker
docker model run hf.co/1bitLLM/bitnet_b1_58-large
- SGLang
How to use 1bitLLM/bitnet_b1_58-large with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "1bitLLM/bitnet_b1_58-large" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "1bitLLM/bitnet_b1_58-large",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "1bitLLM/bitnet_b1_58-large" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "1bitLLM/bitnet_b1_58-large",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```
- Docker Model Runner
How to use 1bitLLM/bitnet_b1_58-large with Docker Model Runner:
docker model run hf.co/1bitLLM/bitnet_b1_58-large
| import os | |
| import json | |
| import argparse | |
| import torch | |
| import random | |
| import glog | |
| from lm_eval import evaluator | |
| from eval_utils import LMEvalAdaptor | |
| from .tokenization_bitnet import BitnetTokenizer | |
| from .modeling_bitnet import BitnetForCausalLM | |
# Command-line interface for the evaluation script. Option names, types and
# defaults are part of the script's public interface and are kept as-is; only
# the descriptive help text is new.
parser = argparse.ArgumentParser(
    description='Evaluate a BitNet checkpoint on lm-eval tasks.',
)
parser.add_argument(
    '--seed', default=0, type=int,
    help='seed applied to the Python and torch random number generators',
)
parser.add_argument(
    '--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str,
    help='model path or hub id passed to from_pretrained for the model and tokenizer',
)
parser.add_argument('--batch_size', type=int, default=1, help='batch size')
parser.add_argument(
    "--tasks", type=str,
    help='comma-separated list of evaluation task names',
)
parser.add_argument(
    "--output_path", default=None, type=str,
    help='file to write the JSON results to; nothing is written when omitted',
)
parser.add_argument(
    '--num_fewshot', type=int, default=0,
    help='number of few-shot examples passed to the evaluator',
)
parser.add_argument(
    '--ctx_size', default=2048, type=int,
    help='context size passed to the evaluation adaptor',
)
def main(args):
    """Evaluate a BitNet checkpoint on lm-eval tasks and optionally save the results.

    Loads the model and tokenizer from ``args.hf_path``, wraps them in an
    ``LMEvalAdaptor``, runs ``evaluator.simple_evaluate`` on the requested
    tasks, prints the results table, and, when ``args.output_path`` is set,
    writes the results to that file as JSON.

    Args:
        args: Parsed command-line namespace. Reads ``hf_path``, ``tasks``
            (comma-separated task names), ``batch_size``, ``ctx_size``,
            ``num_fewshot`` and ``output_path``.

    Raises:
        ValueError: If ``args.tasks`` is missing or names no task.
    """
    # Validate the task list before the model load so that a missing
    # --tasks fails immediately instead of crashing on ``None.split``
    # after the checkpoint has been loaded.
    task_names = [
        name.strip() for name in (args.tasks or "").split(",") if name.strip()
    ]
    if not task_names:
        raise ValueError("--tasks is required (comma-separated task names)")

    model_str = args.hf_path
    model = BitnetForCausalLM.from_pretrained(
        args.hf_path,
        device_map='auto',
        low_cpu_mem_usage=True,
        use_flash_attention_2=True,
        torch_dtype=torch.float16,
    ).half()
    tokenizer = BitnetTokenizer.from_pretrained(args.hf_path, use_fast=False)
    glog.info('loaded model!')

    lm_eval_model = LMEvalAdaptor(
        model_str, model, tokenizer, args.batch_size, args.ctx_size
    )
    results = evaluator.simple_evaluate(
        model=lm_eval_model,
        tasks=task_names,
        batch_size=args.batch_size,
        no_cache=True,
        num_fewshot=args.num_fewshot,
    )

    print(evaluator.make_table(results))

    if args.output_path is not None:
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError; only create the directory when there is one.
        output_dir = os.path.dirname(args.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Store the model path string in the config; per the original note
        # the results cannot be saved otherwise (presumably the entry placed
        # there by the evaluator is not JSON-serializable -- TODO confirm).
        results["config"]["model"] = args.hf_path
        with open(args.output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
if __name__ == "__main__":
    # Script entry point: evaluation only, so autograd is switched off
    # globally before anything touches the model.
    torch.set_grad_enabled(False)
    args = parser.parse_args()

    # Seed both the Python and the torch generators from --seed.
    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)

    main(args)