import spaces
import torch
from typing import Generator
from transformers import AutoModelForCausalLM, AutoTokenizer
from config import MODEL_NAME, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE

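# Model and tokenizer are cached as module-level globals so that repeated
# GPU calls reuse the same loaded weights.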
tokenizer = None
model = None


def initialize_model():
    """Initializes and loads the model and tokenizer once onto the GPU."""
    global tokenizer, model
    if model is None:
        try:
            print(f"Loading model {MODEL_NAME}...")

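            # Prefer bfloat16 on GPU to halve memory use; fall back to float32 on CPU.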
            dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=dtype,
                device_map="auto",
            )
            model.eval()

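            # Some checkpoints ship without a pad token; reuse EOS so padding works.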
            if tokenizer.pad_token_id is None:
                tokenizer.pad_token_id = tokenizer.eos_token_id

            print("Model loaded successfully.")
        except Exception as e:
            print(f"Failed to load model: {e}")
            raise
    return tokenizer, model


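# Warm the model at import time; if loading fails here, it is retried lazily
# on the first request.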
try:
    initialize_model()
except Exception as e:
    print(f"Warning: Global model initialization failed: {e}")


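# On ZeroGPU Spaces, the decorator below requests a GPU for up to `duration`
# seconds per call.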
@spaces.GPU(duration=120)
def stream_generate_response(prompt: str, history: list) -> Generator[str, None, None]:
    """
    Generates a response from the KAT model, streaming partial text as it decodes.
    """
    global tokenizer, model

    if model is None or tokenizer is None:
        initialize_model()

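    # Rebuild the chat transcript from Gradio-style (user, assistant) tuples.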
    messages = []
    for human, bot in history:
        if human:
            messages.append({"role": "user", "content": human})
        if bot:
            messages.append({"role": "assistant", "content": bot})

    messages.append({"role": "user", "content": prompt})

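    # Render the transcript with the model's chat template, appending the
    # generation prompt so the model continues as the assistant.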
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

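    # Tokenize the rendered prompt and move the tensors to the model's device.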
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    generated_ids = []
    past_key_values = None
    next_input = input_ids

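    # Manual decode loop: one forward pass per new token, reusing the KV cache.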
    while len(generated_ids) < MAX_NEW_TOKENS:
        with torch.no_grad():
            outputs = model(
                input_ids=next_input,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                use_cache=True,
                return_dict=True,
            )

        # Reuse the KV cache so each step only processes the newest token
        # instead of re-encoding the entire sequence.
        past_key_values = outputs.past_key_values
        next_token_logits = outputs.logits[:, -1, :]

        # Temperature-scaled sampling; fall back to greedy decoding when
        # sampling is disabled or the temperature is zero.
        if DO_SAMPLE and TEMPERATURE > 0:
            probs = torch.softmax(next_token_logits / TEMPERATURE, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        else:
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        # Stop cleanly when the model emits its end-of-sequence token.
        if next_token.item() == tokenizer.eos_token_id:
            break

        generated_ids.append(next_token.item())

        # Decode the full generated sequence each step: decoding tokens one at
        # a time can split multi-byte characters across BPE tokens and emit
        # replacement characters mid-stream.
        yield tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Feed only the new token next iteration; extend the mask to cover it.
        next_input = next_token
        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)

    # Emit one final, stripped version of the complete response.
    final_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    if final_text:
        yield final_text
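

if __name__ == "__main__":
    # Minimal local smoke test, a sketch only: it assumes config.py defines the
    # constants imported above and that the model fits on this machine. The
    # prompt is illustrative.
    last = ""
    for partial in stream_generate_response("Hello! Who are you?", history=[]):
        last = partial
    print(last)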