Spaces:

Speedofmastery
/

orynxml-agents

Paused

App Files Files Community

orynxml-agents / app /agent /huggingface_agent.py

Speedofmastery

Upload folder using huggingface_hub

88f3fce verified about 1 month ago

raw

history blame contribute delete

31.6 kB

	"""
	Hugging Face Agent Integration for OpenManus
	Extends the main AI agent with access to thousands of HuggingFace models
	"""

	import os
	from typing import Any, Dict, List, Optional

	from app.agent.base import BaseAgent
	from app.huggingface_models import ModelCategory
	from app.logger import logger
	from app.tool.huggingface_models_tool import HuggingFaceModelsTool


	class HuggingFaceAgent(BaseAgent):
	"""AI Agent with integrated HuggingFace model access"""

	def __init__(self, **config):
	    """Initialise the base agent and attach the HuggingFace tool if possible.

	    The token is taken from the ``HUGGINGFACE_TOKEN`` environment variable;
	    the ``huggingface_token`` config entry is only consulted when that
	    variable is unset or empty. Without a token ``self.hf_tool`` is set to
	    ``None`` and a warning is logged.

	    Args:
	        **config: Forwarded unchanged to the base class constructor; may
	            also carry ``huggingface_token``.
	    """
	    super().__init__(**config)

	    # Environment variable takes precedence over the config entry.
	    token = os.getenv("HUGGINGFACE_TOKEN")
	    if not token:
	        token = config.get("huggingface_token")

	    if token:
	        self.hf_tool = HuggingFaceModelsTool(token)
	    else:
	        logger.warning(
	            "No Hugging Face token provided. HF models will not be available."
	        )
	        self.hf_tool = None

	    # Model used for each task when the caller does not name one.
	    self.default_models = dict(
	        text_generation="MiniMax-M2",  # Latest high-performance model
	        image_generation="FLUX.1 Dev",  # Best quality image generation
	        speech_recognition="Whisper Large v3",  # Best multilingual ASR
	        text_to_speech="Kokoro 82M",  # High quality, lightweight TTS
	        image_classification="ViT Base Patch16",  # General image classification
	        embeddings="Sentence Transformers All MiniLM",  # Fast embeddings
	        translation="M2M100 1.2B",  # Multilingual translation
	        summarization="PEGASUS XSum",  # Abstractive summarization
	    )

	async def generate_text_with_hf(
	    self,
	    prompt: str,
	    model_name: Optional[str] = None,
	    max_tokens: int = 200,
	    temperature: float = 0.7,
	    stream: bool = False,
	) -> Dict[str, Any]:
	    """Run text generation through the HuggingFace tool.

	    Args:
	        prompt: Text handed to the model.
	        model_name: Model to use; the ``text_generation`` default is used
	            when this is empty.
	        max_tokens: Forwarded to the tool as ``max_tokens``.
	        temperature: Forwarded to the tool as ``temperature``.
	        stream: Forwarded to the tool as ``stream``.

	    Returns:
	        Whatever the tool's ``text_generation`` returns, or an ``error``
	        dict when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    chosen = model_name if model_name else self.default_models["text_generation"]
	    request = {
	        "model_name": chosen,
	        "prompt": prompt,
	        "max_tokens": max_tokens,
	        "temperature": temperature,
	        "stream": stream,
	    }
	    return await tool.text_generation(**request)

	async def generate_image_with_hf(
	    self,
	    prompt: str,
	    model_name: Optional[str] = None,
	    negative_prompt: Optional[str] = None,
	    width: int = 1024,
	    height: int = 1024,
	) -> Dict[str, Any]:
	    """Render an image from a text prompt via the HuggingFace tool.

	    Args:
	        prompt: Description of the image to generate.
	        model_name: Model to use; the ``image_generation`` default is used
	            when this is empty.
	        negative_prompt: Forwarded to the tool as ``negative_prompt``.
	        width: Forwarded to the tool as ``width``.
	        height: Forwarded to the tool as ``height``.

	    Returns:
	        Whatever the tool's ``generate_image`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = self.default_models["image_generation"]

	    result = await tool.generate_image(
	        model_name=model_name,
	        prompt=prompt,
	        negative_prompt=negative_prompt,
	        width=width,
	        height=height,
	    )
	    return result

	async def transcribe_audio_with_hf(
	    self,
	    audio_data: bytes,
	    model_name: Optional[str] = None,
	    language: Optional[str] = None,
	) -> Dict[str, Any]:
	    """Turn audio into text via the HuggingFace tool.

	    Args:
	        audio_data: Raw audio payload handed to the tool.
	        model_name: Model to use; the ``speech_recognition`` default is
	            used when this is empty.
	        language: Forwarded to the tool as ``language``.

	    Returns:
	        Whatever the tool's ``transcribe_audio`` returns, or an ``error``
	        dict when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    asr_model = model_name if model_name else self.default_models["speech_recognition"]
	    return await tool.transcribe_audio(
	        model_name=asr_model,
	        audio_data=audio_data,
	        language=language,
	    )

	async def synthesize_speech_with_hf(
	    self,
	    text: str,
	    model_name: Optional[str] = None,
	    voice_id: Optional[str] = None,
	) -> Dict[str, Any]:
	    """Produce speech for ``text`` via the HuggingFace tool.

	    Args:
	        text: Text to be spoken.
	        model_name: Model to use; the ``text_to_speech`` default is used
	            when this is empty.
	        voice_id: Forwarded to the tool as ``voice_id``.

	    Returns:
	        Whatever the tool's ``text_to_speech`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = self.default_models["text_to_speech"]

	    return await tool.text_to_speech(
	        model_name=model_name,
	        text=text,
	        voice_id=voice_id,
	    )

	async def classify_image_with_hf(
	    self, image_data: bytes, model_name: Optional[str] = None, task: str = "general"
	) -> Dict[str, Any]:
	    """Classify an image via the HuggingFace tool.

	    Args:
	        image_data: Raw image payload handed to the tool.
	        model_name: Model to use for a general task; the
	            ``image_classification`` default is used when this is empty.
	        task: ``"nsfw"``, ``"emotions"`` and ``"deepfake"`` each force a
	            dedicated model and override ``model_name``; any other value
	            is treated as a general classification.

	    Returns:
	        Whatever the tool's ``classify_image`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    # Specialised tasks pin their own model regardless of the caller's choice.
	    forced_models = (
	        ("nsfw", "NSFW Image Detection"),
	        ("emotions", "Facial Emotions Detection"),
	        ("deepfake", "Deepfake Detection"),
	    )
	    for known_task, forced_model in forced_models:
	        if task == known_task:
	            model_name = forced_model
	            break
	    else:
	        model_name = model_name or self.default_models["image_classification"]

	    return await tool.classify_image(model_name=model_name, image_data=image_data)

	async def get_text_embeddings_with_hf(
	    self, texts: List[str], model_name: Optional[str] = None
	) -> Dict[str, Any]:
	    """Embed a list of texts via the HuggingFace tool.

	    Args:
	        texts: Strings to embed.
	        model_name: Model to use; the ``embeddings`` default is used when
	            this is empty.

	    Returns:
	        Whatever the tool's ``get_embeddings`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    embedder = model_name if model_name else self.default_models["embeddings"]
	    return await tool.get_embeddings(model_name=embedder, texts=texts)

	async def translate_with_hf(
	    self,
	    text: str,
	    target_language: str,
	    source_language: Optional[str] = None,
	    model_name: Optional[str] = None,
	) -> Dict[str, Any]:
	    """Translate ``text`` via the HuggingFace tool.

	    Args:
	        text: Text to translate.
	        target_language: Forwarded to the tool as ``target_language``.
	        source_language: Forwarded to the tool as ``source_language``.
	        model_name: Model to use; the ``translation`` default is used when
	            this is empty.

	    Returns:
	        Whatever the tool's ``translate_text`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    translator = model_name if model_name else self.default_models["translation"]
	    request = {
	        "model_name": translator,
	        "text": text,
	        "source_language": source_language,
	        "target_language": target_language,
	    }
	    return await tool.translate_text(**request)

	async def summarize_with_hf(
	    self, text: str, model_name: Optional[str] = None, max_length: int = 150
	) -> Dict[str, Any]:
	    """Summarize ``text`` via the HuggingFace tool.

	    Args:
	        text: Text to condense.
	        model_name: Model to use; the ``summarization`` default is used
	            when this is empty.
	        max_length: Forwarded to the tool as ``max_length``.

	    Returns:
	        Whatever the tool's ``summarize_text`` returns, or an ``error`` dict
	        when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = self.default_models["summarization"]

	    return await tool.summarize_text(
	        model_name=model_name,
	        text=text,
	        max_length=max_length,
	    )

	def get_available_hf_models(self, category: Optional[str] = None) -> Dict[str, Any]:
	    """Return the tool's model listing, optionally narrowed to ``category``.

	    NOTE(review): a second method with this exact name is defined at the end
	    of the class; in Python the later definition replaces this one.

	    Returns:
	        Whatever the tool's ``list_available_models`` returns, or an
	        ``error`` dict when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if tool:
	        return tool.list_available_models(category)
	    return {"error": "HuggingFace integration not available"}

	async def smart_model_selection(
	    self, task_description: str, content_type: str = "text"
	) -> str:
	    """
	    Pick a HuggingFace model name for a free-form task description.

	    Selection runs in three stages:

	    1. Intents that ``execute_hf_task`` routes on (image generation,
	       transcription, text-to-speech, image classification, embeddings,
	       translation, summarization) are resolved first, using the same
	       substring conditions as the router, so the chosen model always fits
	       the method the task is dispatched to.
	    2. Domain heuristics (video, code, 3D, documents, ...) are tried in
	       order; the first matching domain wins.
	    3. ``"MiniMax-M2"`` is returned when nothing matches.

	    Domain keywords are matched as whole words or phrases (allowing the
	    endings ``s``, ``es``, ``d``, ``ed``, ``ing``) rather than as raw
	    substrings, so short keywords such as ``"ar"``, ``"class"`` or
	    ``"motion"`` no longer fire inside "summarize", "classify" or "emotion".

	    Args:
	        task_description: Description of what the user wants to do
	        content_type: Type of content (text, image, audio, video)

	    Returns:
	        The display name of the selected model; never ``None``.
	    """
	    import re  # function-scope import: the module itself does not import ``re``

	    task_lower = task_description.lower()

	    def mentions(*keywords: str) -> bool:
	        """Return True if any keyword occurs as a whole word or phrase."""
	        return any(
	            re.search(rf"\b{re.escape(keyword)}(?:s|es|d|ed|ing)?\b", task_lower)
	            for keyword in keywords
	        )

	    def image_analysis_model() -> str:
	        """Pick the classifier that fits the kind of image analysis requested."""
	        if mentions("nsfw", "safe", "unsafe", "safety"):
	            return "NSFW Image Detection"
	        if mentions("emotion", "emotional", "face", "facial"):
	            return "Facial Emotions Detection"
	        if mentions("deepfake", "fake"):
	            return "Deepfake Detection"
	        return "ViT Base Patch16"  # General classification

	    # ---- Stage 1: intents mirrored from execute_hf_task's routing ----
	    # These use the router's exact substring tests so that model choice and
	    # dispatch target can never disagree.
	    if "generate" in task_lower and (
	        "image" in task_lower or "picture" in task_lower
	    ):
	        if mentions("fast", "faster", "quick", "quickly"):
	            return "FLUX.1 Schnell"
	        return "FLUX.1 Dev"

	    if "transcribe" in task_lower or "speech to text" in task_lower:
	        return "Whisper Large v3"

	    if "text to speech" in task_lower or "tts" in task_lower:
	        if mentions("fast", "faster"):
	            return "Kokoro 82M"  # Lightweight and fast
	        return "VibeVoice 1.5B"  # High quality

	    if "classify" in task_lower and content_type == "image":
	        return image_analysis_model()

	    if "embed" in task_lower or "vector" in task_lower:
	        return "Sentence Transformers All MiniLM"  # Fast embeddings

	    if "translate" in task_lower:
	        return "M2M100 1.2B"  # Multilingual translation

	    if "summarize" in task_lower:
	        return "PEGASUS XSum"  # Best summarization

	    # ---- Stage 2: domain heuristics, first match wins ----

	    # Video generation and processing
	    if mentions(
	        "video", "movie", "animation", "motion", "gif", "sequence", "frames"
	    ):
	        if mentions("generate", "create"):
	            return "Stable Video Diffusion"
	        if mentions("analyze", "describe"):
	            return "Video ChatGPT"
	        return "AnimateDiff"

	    # Code and App Development
	    if mentions(
	        "code",
	        "programming",
	        "app",
	        "application",
	        "software",
	        "develop",
	        "build",
	        "function",
	        "class",
	        "api",
	        "database",
	        "website",
	        "frontend",
	        "backend",
	    ):
	        if mentions("app", "application"):
	            return "CodeLlama 34B Instruct"  # Best for full applications
	        if mentions("python"):
	            return "WizardCoder 34B"  # Python specialist
	        if mentions("api"):
	            return "StarCoder2 15B"  # Good for APIs
	        if mentions("explain", "comment"):
	            return "Phind CodeLlama"  # Best for code explanation
	        return "DeepSeek Coder V2"  # General coding

	    # 3D and AR/VR Content
	    if mentions(
	        "3d",
	        "three dimensional",
	        "mesh",
	        "model",
	        "obj",
	        "stl",
	        "ar",
	        "vr",
	        "augmented reality",
	        "virtual reality",
	        "texture",
	        "material",
	    ):
	        if mentions("text") and mentions("3d", "model"):
	            return "Shap-E"
	        if mentions("image") and mentions("3d"):
	            return "DreamFusion"
	        return "Point-E"

	    # Document Processing and OCR
	    if mentions(
	        "ocr",
	        "document",
	        "pdf",
	        "scan",
	        "extract text",
	        "handwriting",
	        "form",
	        "table",
	        "layout",
	        "invoice",
	        "receipt",
	        "contract",
	    ):
	        if mentions("handwriting", "handwritten"):
	            return "TrOCR Handwritten"
	        if mentions("table"):
	            return "TableTransformer"
	        if mentions("form"):
	            return "FormNet"
	        return "TrOCR Large"

	    # Multimodal AI
	    if mentions(
	        "visual question",
	        "image question",
	        "describe image",
	        "multimodal",
	        "vision language",
	        "image text",
	        "cross modal",
	    ):
	        if mentions("chat", "conversation"):
	            return "GPT-4V"
	        if mentions("question"):
	            return "LLaVA"
	        return "BLIP-2"

	    # Creative Content
	    if mentions(
	        "story",
	        "creative",
	        "poem",
	        "poetry",
	        "novel",
	        "screenplay",
	        "script",
	        "blog",
	        "article",
	        "marketing",
	        "copy",
	        "advertising",
	    ):
	        if mentions("story", "novel"):
	            return "Novel AI"
	        if mentions("poem", "poetry"):
	            return "Poet Assistant"
	        if mentions("marketing", "copy"):
	            return "Marketing Copy AI"
	        return "GPT-3.5 Creative"

	    # Game Development
	    if mentions(
	        "game",
	        "character",
	        "npc",
	        "level",
	        "dialogue",
	        "asset",
	        "quest",
	        "gameplay",
	        "mechanic",
	        "unity",
	        "unreal",
	    ):
	        if mentions("character"):
	            return "Character AI"
	        if mentions("level", "environment"):
	            return "Level Designer"
	        if mentions("dialogue", "conversation"):
	            return "Dialogue Writer"
	        return "Asset Creator"

	    # Science and Research
	    if mentions(
	        "research",
	        "scientific",
	        "paper",
	        "analysis",
	        "data",
	        "protein",
	        "molecule",
	        "chemistry",
	        "biology",
	        "physics",
	        "experiment",
	    ):
	        if mentions("protein", "folding"):
	            return "AlphaFold"
	        if mentions("molecule", "chemistry"):
	            return "ChemBERTa"
	        if mentions("data") and mentions("analysis"):
	            return "Data Analyst"
	        return "SciBERT"

	    # Business and Productivity
	    if mentions(
	        "email",
	        "business",
	        "report",
	        "presentation",
	        "meeting",
	        "project",
	        "plan",
	        "proposal",
	        "memo",
	        "letter",
	        "professional",
	    ):
	        if mentions("email"):
	            return "Email Assistant"
	        if mentions("presentation"):
	            return "Presentation AI"
	        if mentions("report"):
	            return "Report Writer"
	        if mentions("meeting"):
	            return "Meeting Summarizer"
	        return "Project Planner"

	    # Specialized AI
	    if mentions(
	        "music",
	        "audio",
	        "sound",
	        "voice clone",
	        "enhance",
	        "restore",
	        "upscale",
	        "remove background",
	        "inpaint",
	        "style transfer",
	    ):
	        if mentions("music"):
	            return "MusicGen"
	        if mentions("voice") and mentions("clone"):
	            return "Voice Cloner"
	        if mentions("upscale", "enhance"):
	            return "Real-ESRGAN"
	        if mentions("background") and mentions("remove"):
	            return "Background Remover"
	        if mentions("restore", "face"):
	            return "GFPGAN"
	        return "LaMa"

	    # Traditional categories
	    if mentions("generate", "write", "create", "compose", "chat", "conversation"):
	        if mentions("chat", "conversation"):
	            return "Llama 3.1 8B Instruct"
	        return "MiniMax-M2"

	    # Image analysis -- checked before image generation so that analysing a
	    # supplied image is never mistaken for a request to generate one.
	    if content_type == "image" and mentions(
	        "classify", "analyze image", "detect", "recognize"
	    ):
	        return image_analysis_model()

	    # Image generation
	    if mentions("image", "picture", "draw", "art", "photo", "visual"):
	        if mentions("fast", "faster", "quick", "quickly"):
	            return "FLUX.1 Schnell"
	        return "FLUX.1 Dev"

	    # Audio processing -- only commits when the input really is audio;
	    # otherwise later checks (and finally the default) still get a chance,
	    # so this branch can no longer fall off the end and return None.
	    if mentions("transcribe", "speech to text", "recognize", "audio"):
	        if content_type == "audio" or "transcribe" in task_lower:
	            return "Whisper Large v3"

	    # Text-to-speech
	    if mentions("speak", "voice", "text to speech", "tts"):
	        if mentions("fast", "faster"):
	            return "Kokoro 82M"  # Lightweight and fast
	        return "VibeVoice 1.5B"  # High quality

	    # Translation
	    if mentions("translate", "language", "convert"):
	        return "M2M100 1.2B"  # Multilingual translation

	    # Summarization
	    if mentions("summarize", "summary", "abstract", "brief"):
	        return "PEGASUS XSum"  # Best summarization

	    # Embeddings/similarity
	    if mentions("similar", "embed", "vector", "search", "match"):
	        return "Sentence Transformers All MiniLM"  # Fast embeddings

	    # ---- Stage 3: default fallback ----
	    return "MiniMax-M2"  # Best general-purpose model

	async def execute_hf_task(
	    self, task: str, content: Any, model_name: Optional[str] = None, **kwargs
	) -> Dict[str, Any]:
	    """
	    Execute any HuggingFace task with intelligent model selection

	    The input is first classified as text, image or audio: ``bytes`` that
	    carry a PNG/JFIF/GIF tag in their first 20 bytes, start with the JPEG
	    start-of-image marker, or are a RIFF/WEBP container count as an image;
	    any other ``bytes`` are assumed to be audio. The task description is
	    then routed by keyword to the matching ``*_with_hf`` method, with text
	    generation as the fallback.

	    Args:
	        task: Task description (e.g., "generate image", "transcribe audio")
	        content: Input content (text, image bytes, audio bytes)
	        model_name: Specific model to use (optional)
	        **kwargs: Additional parameters

	    Returns:
	        The result of the method the task was routed to, or an ``error``
	        dict when no HuggingFace tool is configured or any exception is
	        raised while executing the task.
	    """
	    if not self.hf_tool:
	        return {"error": "HuggingFace integration not available"}

	    try:
	        task_lower = task.lower()

	        # Determine content type
	        content_type = "text"
	        if isinstance(content, bytes):
	            header = content[:20]
	            looks_like_image = (
	                b"PNG" in header
	                or b"JFIF" in header
	                or b"GIF" in header
	                # JPEG start-of-image marker: catches Exif and other JPEGs
	                # that carry no "JFIF" tag near the start.
	                or content.startswith(b"\xff\xd8\xff")
	                # WebP is a RIFF container whose form type is "WEBP"; other
	                # RIFF payloads (e.g. WAVE) still fall through to audio.
	                or (content[:4] == b"RIFF" and content[8:12] == b"WEBP")
	            )
	            content_type = "image" if looks_like_image else "audio"

	        # Auto-select model if not specified
	        if not model_name:
	            model_name = await self.smart_model_selection(task, content_type)

	        # Route to appropriate method based on task
	        if "generate" in task_lower and (
	            "image" in task_lower or "picture" in task_lower
	        ):
	            return await self.generate_image_with_hf(content, model_name, **kwargs)

	        elif "transcribe" in task_lower or "speech to text" in task_lower:
	            return await self.transcribe_audio_with_hf(
	                content, model_name, **kwargs
	            )

	        elif "text to speech" in task_lower or "tts" in task_lower:
	            return await self.synthesize_speech_with_hf(
	                content, model_name, **kwargs
	            )

	        elif "classify" in task_lower and content_type == "image":
	            return await self.classify_image_with_hf(content, model_name, **kwargs)

	        elif "embed" in task_lower or "vector" in task_lower:
	            # The embeddings method expects a list; wrap a single string.
	            texts = [content] if isinstance(content, str) else content
	            return await self.get_text_embeddings_with_hf(texts, model_name)

	        elif "translate" in task_lower:
	            # model_name goes by keyword because it is not the second
	            # positional parameter of translate_with_hf.
	            return await self.translate_with_hf(
	                content, model_name=model_name, **kwargs
	            )

	        elif "summarize" in task_lower:
	            return await self.summarize_with_hf(content, model_name, **kwargs)

	        else:
	            # Default to text generation
	            return await self.generate_text_with_hf(content, model_name, **kwargs)

	    except Exception as e:
	        # Top-level boundary: report the failure to the caller as an error
	        # dict instead of letting it propagate.
	        logger.error(f"HuggingFace task execution failed: {e}")
	        return {"error": f"Task execution failed: {str(e)}"}

	async def chat_with_hf_models(
	    self, message: str, conversation_history: List[Dict] = None
	) -> Dict[str, Any]:
	    """
	    Chat entry point that can divert a message to HuggingFace models.

	    The message is handled by the first rule that applies:

	    1. It asks to list models -> the available-model listing is returned.
	    2. It contains a HuggingFace trigger phrase -> the message is executed
	       as a HuggingFace task, serving as both task description and content.
	    3. Otherwise the base class's ``chat`` answers; when the message mentions
	       both "image" and "generate", an ``hf_suggestion`` entry is added to
	       that answer.

	    Args:
	        message: The user's message.
	        conversation_history: Passed through to the base class's ``chat``.
	    """
	    lowered = message.lower()

	    # Model listing requests
	    wants_listing = "list" in lowered and ("model" in lowered or "hf" in lowered)
	    if wants_listing:
	        return self.get_available_hf_models()

	    # Phrases that mark the message as a HuggingFace model request
	    hf_triggers = (
	        "generate image",
	        "create image",
	        "draw",
	        "picture",
	        "transcribe",
	        "speech to text",
	        "audio",
	        "text to speech",
	        "speak",
	        "voice",
	        "translate",
	        "language",
	        "classify image",
	        "embed",
	        "vector",
	        "similarity",
	        "summarize",
	    )
	    for trigger in hf_triggers:
	        if trigger in lowered:
	            return await self.execute_hf_task(message, message)

	    # Regular chat: answer with the base agent first
	    reply = await super().chat(message, conversation_history)

	    # The user might want image generation; advertise it alongside the reply
	    if "image" in lowered and "generate" in lowered:
	        reply["hf_suggestion"] = {
	            "action": "generate_image",
	            "models": ["FLUX.1 Dev", "FLUX.1 Schnell", "Stable Diffusion XL"],
	            "message": "I can also generate images for you using HuggingFace models. Just ask!",
	        }

	    return reply

	# New methods for expanded model categories

	async def generate_video_with_hf(
	    self, prompt: str, model_name: Optional[str] = None, **kwargs
	) -> Dict[str, Any]:
	    """Create a video from a text prompt via the HuggingFace tool.

	    Args:
	        prompt: Description of the video to generate.
	        model_name: Model to use; ``"Stable Video Diffusion"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``text_to_video``.

	    Returns:
	        Whatever ``text_to_video`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    video_model = model_name if model_name else "Stable Video Diffusion"
	    return await tool.text_to_video(
	        model_name=video_model,
	        prompt=prompt,
	        **kwargs,
	    )

	async def generate_code_with_hf(
	    self,
	    prompt: str,
	    language: str = "python",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Produce source code from a natural-language description.

	    Args:
	        prompt: What the code should do.
	        language: Forwarded to the tool as ``language``.
	        model_name: Model to use; ``"CodeLlama 34B Instruct"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``code_generation``.

	    Returns:
	        Whatever ``code_generation`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = "CodeLlama 34B Instruct"

	    return await tool.code_generation(
	        model_name=model_name,
	        prompt=prompt,
	        language=language,
	        **kwargs,
	    )

	async def generate_app_with_hf(
	    self,
	    description: str,
	    app_type: str = "web_app",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Ask a code model for a complete application.

	    The description is wrapped into a prompt that names ``app_type`` and is
	    then sent to the tool's ``code_generation``.

	    Args:
	        description: What the application should do.
	        app_type: Kind of application named in the prompt.
	        model_name: Model to use; ``"CodeLlama 34B Instruct"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``code_generation``.

	    Returns:
	        Whatever ``code_generation`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    code_model = model_name if model_name else "CodeLlama 34B Instruct"
	    return await tool.code_generation(
	        model_name=code_model,
	        prompt=f"Create a {app_type} application: {description}",
	        **kwargs,
	    )

	async def generate_3d_model_with_hf(
	    self, prompt: str, model_name: Optional[str] = None, **kwargs
	) -> Dict[str, Any]:
	    """Create a 3D model from a text description via the HuggingFace tool.

	    Args:
	        prompt: Description of the object to generate.
	        model_name: Model to use; ``"Shap-E"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``text_to_3d``.

	    Returns:
	        Whatever ``text_to_3d`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = "Shap-E"

	    return await tool.text_to_3d(model_name=model_name, prompt=prompt, **kwargs)

	async def process_document_with_hf(
	    self,
	    document_data: bytes,
	    task_type: str = "ocr",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Run OCR or document analysis on a document.

	    Args:
	        document_data: Raw document payload.
	        task_type: ``"ocr"`` sends the payload to the tool's ``ocr`` (as
	            ``image_data``); any other value sends it to
	            ``document_analysis`` (as ``document_data``).
	        model_name: Model to use; ``"TrOCR Large"`` for OCR and
	            ``"LayoutLMv3"`` for analysis when empty.
	        **kwargs: Forwarded unchanged to the chosen tool method.

	    Returns:
	        Whatever the chosen tool method returns, or an ``error`` dict when
	        no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if task_type != "ocr":
	        # Everything that is not plain OCR is treated as layout analysis.
	        return await tool.document_analysis(
	            model_name=model_name or "LayoutLMv3",
	            document_data=document_data,
	            **kwargs,
	        )

	    return await tool.ocr(
	        model_name=model_name or "TrOCR Large",
	        image_data=document_data,
	        **kwargs,
	    )

	async def multimodal_chat_with_hf(
	    self, image_data: bytes, text: str, model_name: Optional[str] = None, **kwargs
	) -> Dict[str, Any]:
	    """Send an image together with text to a vision-language model.

	    Args:
	        image_data: Raw image payload.
	        text: Text accompanying the image.
	        model_name: Model to use; ``"BLIP-2"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``vision_language``.

	    Returns:
	        Whatever ``vision_language`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    vl_model = model_name if model_name else "BLIP-2"
	    return await tool.vision_language(
	        model_name=vl_model,
	        image_data=image_data,
	        text=text,
	        **kwargs,
	    )

	async def generate_music_with_hf(
	    self,
	    prompt: str,
	    duration: int = 30,
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Compose music from a text description via the HuggingFace tool.

	    Args:
	        prompt: Description of the music to generate.
	        duration: Forwarded to the tool as ``duration``.
	        model_name: Model to use; ``"MusicGen"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``music_generation``.

	    Returns:
	        Whatever ``music_generation`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = "MusicGen"

	    return await tool.music_generation(
	        model_name=model_name,
	        prompt=prompt,
	        duration=duration,
	        **kwargs,
	    )

	async def enhance_image_with_hf(
	    self,
	    image_data: bytes,
	    task_type: str = "super_resolution",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Enhance images with various AI models

	    Args:
	        image_data: Raw image payload handed to the tool.
	        task_type: One of ``"super_resolution"``, ``"background_removal"``
	            or ``"face_restoration"``.
	        model_name: Model to use; each task has its own default
	            (``"Real-ESRGAN"``, ``"Background Remover"``, ``"GFPGAN"``).
	        **kwargs: Forwarded unchanged to the chosen tool method.

	    Returns:
	        Whatever the chosen tool method returns. An ``error`` dict is
	        returned when no HuggingFace tool is configured or ``task_type`` is
	        not one of the supported values.
	    """
	    if not self.hf_tool:
	        return {"error": "HuggingFace integration not available"}

	    if task_type == "super_resolution":
	        model_name = model_name or "Real-ESRGAN"
	        return await self.hf_tool.super_resolution(
	            model_name=model_name, image_data=image_data, **kwargs
	        )
	    elif task_type == "background_removal":
	        model_name = model_name or "Background Remover"
	        return await self.hf_tool.background_removal(
	            model_name=model_name, image_data=image_data, **kwargs
	        )
	    elif task_type == "face_restoration":
	        # Face restoration goes through the tool's super_resolution entry
	        # point, just with a different default model.
	        model_name = model_name or "GFPGAN"
	        return await self.hf_tool.super_resolution(
	            model_name=model_name, image_data=image_data, **kwargs
	        )

	    # Unknown task types used to fall off the end and return None, which
	    # broke the declared Dict return type; report them explicitly instead.
	    return {"error": f"Unsupported image enhancement task type: {task_type}"}

	async def generate_creative_content_with_hf(
	    self,
	    prompt: str,
	    content_type: str = "story",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Write creative content (stories, poems, ...) via the HuggingFace tool.

	    The prompt is prefixed with the requested ``content_type`` before it is
	    sent to the tool's ``creative_writing``.

	    Args:
	        prompt: What the piece should be about.
	        content_type: Kind of piece named in the prompt.
	        model_name: Model to use; ``"GPT-3.5 Creative"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``creative_writing``.

	    Returns:
	        Whatever ``creative_writing`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    writer_model = model_name if model_name else "GPT-3.5 Creative"
	    return await tool.creative_writing(
	        model_name=writer_model,
	        prompt=f"Write a {content_type}: {prompt}",
	        **kwargs,
	    )

	async def generate_game_content_with_hf(
	    self,
	    description: str,
	    content_type: str = "character",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Create game-development content via the HuggingFace tool.

	    The description is wrapped into a prompt naming ``content_type`` and
	    sent to the tool's ``creative_writing``.

	    Args:
	        description: What should be created.
	        content_type: Kind of game content named in the prompt.
	        model_name: Model to use; ``"Character AI"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``creative_writing``.

	    Returns:
	        Whatever ``creative_writing`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = "Character AI"

	    game_prompt = f"Create game {content_type}: {description}"
	    return await tool.creative_writing(
	        model_name=model_name,
	        prompt=game_prompt,
	        **kwargs,
	    )

	async def generate_business_document_with_hf(
	    self,
	    context: str,
	    document_type: str = "email",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Draft a business document via the HuggingFace tool.

	    Args:
	        context: Forwarded to the tool as ``context``.
	        document_type: Forwarded to the tool as ``document_type``.
	        model_name: Model to use; ``"Email Assistant"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``business_document``.

	    Returns:
	        Whatever ``business_document`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    drafting_model = model_name if model_name else "Email Assistant"
	    return await tool.business_document(
	        model_name=drafting_model,
	        document_type=document_type,
	        context=context,
	        **kwargs,
	    )

	async def research_assistance_with_hf(
	    self,
	    topic: str,
	    research_type: str = "analysis",
	    model_name: Optional[str] = None,
	    **kwargs,
	) -> Dict[str, Any]:
	    """Generate research-oriented text on a topic via the HuggingFace tool.

	    The topic is wrapped into a prompt naming ``research_type`` and sent to
	    the tool's ``text_generation``.

	    Args:
	        topic: Subject to research.
	        research_type: Kind of research named in the prompt.
	        model_name: Model to use; ``"SciBERT"`` when empty.
	        **kwargs: Forwarded unchanged to the tool's ``text_generation``.

	    Returns:
	        Whatever ``text_generation`` returns, or an ``error`` dict when no
	        HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if not tool:
	        return {"error": "HuggingFace integration not available"}

	    if not model_name:
	        model_name = "SciBERT"

	    return await tool.text_generation(
	        model_name=model_name,
	        prompt=f"Research {research_type} on: {topic}",
	        **kwargs,
	    )

	def get_available_hf_models(self, category: Optional[str] = None) -> Dict[str, Any]:
	    """List the models the HuggingFace tool offers, optionally by category.

	    NOTE(review): this is the second definition of this name in the class;
	    being the later one, it is the definition that takes effect.

	    Returns:
	        Whatever the tool's ``list_available_models`` returns, or an
	        ``error`` dict when no HuggingFace tool is configured.
	    """
	    tool = self.hf_tool
	    if tool:
	        return tool.list_available_models(category=category)
	    return {"error": "HuggingFace integration not available"}