""" Hugging Face Agent Integration for OpenManus Extends the main AI agent with access to thousands of HuggingFace models """ import os from typing import Any, Dict, List, Optional from app.agent.base import BaseAgent from app.huggingface_models import ModelCategory from app.logger import logger from app.tool.huggingface_models_tool import HuggingFaceModelsTool class HuggingFaceAgent(BaseAgent): """AI Agent with integrated HuggingFace model access""" def __init__(self, **config): super().__init__(**config) # Initialize HuggingFace integration hf_token = os.getenv("HUGGINGFACE_TOKEN") or config.get("huggingface_token") if not hf_token: logger.warning( "No Hugging Face token provided. HF models will not be available." ) self.hf_tool = None else: self.hf_tool = HuggingFaceModelsTool(hf_token) # Default models for different tasks self.default_models = { "text_generation": "MiniMax-M2", # Latest high-performance model "image_generation": "FLUX.1 Dev", # Best quality image generation "speech_recognition": "Whisper Large v3", # Best multilingual ASR "text_to_speech": "Kokoro 82M", # High quality, lightweight TTS "image_classification": "ViT Base Patch16", # General image classification "embeddings": "Sentence Transformers All MiniLM", # Fast embeddings "translation": "M2M100 1.2B", # Multilingual translation "summarization": "PEGASUS XSum", # Abstractive summarization } async def generate_text_with_hf( self, prompt: str, model_name: Optional[str] = None, max_tokens: int = 200, temperature: float = 0.7, stream: bool = False, ) -> Dict[str, Any]: """Generate text using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["text_generation"] return await self.hf_tool.text_generation( model_name=model_name, prompt=prompt, max_tokens=max_tokens, temperature=temperature, stream=stream, ) async def generate_image_with_hf( self, prompt: str, model_name: Optional[str] = None, negative_prompt: Optional[str] = None, width: int = 1024, height: int = 1024, ) -> Dict[str, Any]: """Generate images using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["image_generation"] return await self.hf_tool.generate_image( model_name=model_name, prompt=prompt, negative_prompt=negative_prompt, width=width, height=height, ) async def transcribe_audio_with_hf( self, audio_data: bytes, model_name: Optional[str] = None, language: Optional[str] = None, ) -> Dict[str, Any]: """Transcribe audio using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["speech_recognition"] return await self.hf_tool.transcribe_audio( model_name=model_name, audio_data=audio_data, language=language ) async def synthesize_speech_with_hf( self, text: str, model_name: Optional[str] = None, voice_id: Optional[str] = None, ) -> Dict[str, Any]: """Generate speech from text using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["text_to_speech"] return await self.hf_tool.text_to_speech( model_name=model_name, text=text, voice_id=voice_id ) async def classify_image_with_hf( self, image_data: bytes, model_name: Optional[str] = None, task: str = "general" ) -> Dict[str, Any]: """Classify images using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} # Choose model based on task if task == "nsfw": model_name = "NSFW Image Detection" elif task == "emotions": model_name = "Facial Emotions Detection" elif task == "deepfake": model_name = "Deepfake Detection" else: model_name = model_name or self.default_models["image_classification"] return await self.hf_tool.classify_image( model_name=model_name, image_data=image_data ) async def get_text_embeddings_with_hf( self, texts: List[str], model_name: Optional[str] = None ) -> Dict[str, Any]: """Get text embeddings using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["embeddings"] return await self.hf_tool.get_embeddings(model_name=model_name, texts=texts) async def translate_with_hf( self, text: str, target_language: str, source_language: Optional[str] = None, model_name: Optional[str] = None, ) -> Dict[str, Any]: """Translate text using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["translation"] return await self.hf_tool.translate_text( model_name=model_name, text=text, source_language=source_language, target_language=target_language, ) async def summarize_with_hf( self, text: str, model_name: Optional[str] = None, max_length: int = 150 ) -> Dict[str, Any]: """Summarize text using HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or self.default_models["summarization"] return await self.hf_tool.summarize_text( model_name=model_name, text=text, max_length=max_length ) def get_available_hf_models(self, category: Optional[str] = None) -> Dict[str, Any]: """Get list of available HuggingFace models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} return self.hf_tool.list_available_models(category) async def smart_model_selection( self, task_description: str, content_type: str = "text" ) -> str: """ Intelligently select the best HuggingFace model for a task Args: task_description: Description of what the user wants to do content_type: Type of content (text, image, audio, video) """ task_lower = task_description.lower() # Video generation and processing if any( keyword in task_lower for keyword in [ "video", "movie", "animation", "motion", "gif", "sequence", "frames", ] ): if "generate" in task_lower or "create" in task_lower: return "Stable Video Diffusion" elif "analyze" in task_lower or "describe" in task_lower: return "Video ChatGPT" else: return "AnimateDiff" # Code and App Development elif any( keyword in task_lower for keyword in [ "code", "programming", "app", "application", "software", "develop", "build", "function", "class", "api", "database", "website", "frontend", "backend", ] ): if "app" in task_lower or "application" in task_lower: return "CodeLlama 34B Instruct" # Best for full applications elif "python" in task_lower: return "WizardCoder 34B" # Python specialist elif "api" in task_lower: return "StarCoder2 15B" # Good for APIs elif "explain" in task_lower or "comment" in task_lower: return "Phind CodeLlama" # Best for code explanation else: return "DeepSeek Coder V2" # General coding # 3D and AR/VR Content elif any( keyword in task_lower for keyword in [ "3d", "three dimensional", "mesh", "model", "obj", "stl", "ar", "vr", "augmented reality", "virtual reality", "texture", "material", ] ): if "text" in task_lower and ("3d" in task_lower or "model" in task_lower): return "Shap-E" elif "image" in task_lower and "3d" in task_lower: return "DreamFusion" else: return "Point-E" # Document Processing and OCR elif any( keyword in task_lower for keyword in [ "ocr", "document", "pdf", "scan", "extract text", "handwriting", "form", "table", "layout", "invoice", "receipt", "contract", ] ): if "handwriting" in task_lower or "handwritten" in task_lower: return "TrOCR Handwritten" elif "table" in task_lower: return "TableTransformer" elif "form" in task_lower: return "FormNet" else: return "TrOCR Large" # Multimodal AI elif any( keyword in task_lower for keyword in [ "visual question", "image question", "describe image", "multimodal", "vision language", "image text", "cross modal", ] ): if "chat" in task_lower or "conversation" in task_lower: return "GPT-4V" elif "question" in task_lower: return "LLaVA" else: return "BLIP-2" # Creative Content elif any( keyword in task_lower for keyword in [ "story", "creative", "poem", "poetry", "novel", "screenplay", "script", "blog", "article", "marketing", "copy", "advertising", ] ): if "story" in task_lower or "novel" in task_lower: return "Novel AI" elif "poem" in task_lower or "poetry" in task_lower: return "Poet Assistant" elif "marketing" in task_lower or "copy" in task_lower: return "Marketing Copy AI" else: return "GPT-3.5 Creative" # Game Development elif any( keyword in task_lower for keyword in [ "game", "character", "npc", "level", "dialogue", "asset", "quest", "gameplay", "mechanic", "unity", "unreal", ] ): if "character" in task_lower: return "Character AI" elif "level" in task_lower or "environment" in task_lower: return "Level Designer" elif "dialogue" in task_lower or "conversation" in task_lower: return "Dialogue Writer" else: return "Asset Creator" # Science and Research elif any( keyword in task_lower for keyword in [ "research", "scientific", "paper", "analysis", "data", "protein", "molecule", "chemistry", "biology", "physics", "experiment", ] ): if "protein" in task_lower or "folding" in task_lower: return "AlphaFold" elif "molecule" in task_lower or "chemistry" in task_lower: return "ChemBERTa" elif "data" in task_lower and "analysis" in task_lower: return "Data Analyst" else: return "SciBERT" # Business and Productivity elif any( keyword in task_lower for keyword in [ "email", "business", "report", "presentation", "meeting", "project", "plan", "proposal", "memo", "letter", "professional", ] ): if "email" in task_lower: return "Email Assistant" elif "presentation" in task_lower: return "Presentation AI" elif "report" in task_lower: return "Report Writer" elif "meeting" in task_lower: return "Meeting Summarizer" else: return "Project Planner" # Specialized AI elif any( keyword in task_lower for keyword in [ "music", "audio", "sound", "voice clone", "enhance", "restore", "upscale", "remove background", "inpaint", "style transfer", ] ): if "music" in task_lower: return "MusicGen" elif "voice" in task_lower and "clone" in task_lower: return "Voice Cloner" elif "upscale" in task_lower or "enhance" in task_lower: return "Real-ESRGAN" elif "background" in task_lower and "remove" in task_lower: return "Background Remover" elif "restore" in task_lower or "face" in task_lower: return "GFPGAN" else: return "LaMa" # Traditional categories elif any( keyword in task_lower for keyword in [ "generate", "write", "create", "compose", "chat", "conversation", ] ): if "chat" in task_lower or "conversation" in task_lower: return "Llama 3.1 8B Instruct" else: return "MiniMax-M2" # Image generation elif any( keyword in task_lower for keyword in ["image", "picture", "draw", "art", "photo", "visual"] ): if "fast" in task_lower or "quick" in task_lower: return "FLUX.1 Schnell" else: return "FLUX.1 Dev" # Audio processing elif any( keyword in task_lower for keyword in ["transcribe", "speech to text", "recognize", "audio"] ): if content_type == "audio" or "transcribe" in task_lower: return "Whisper Large v3" # Text-to-speech elif any( keyword in task_lower for keyword in ["speak", "voice", "text to speech", "tts"] ): if "fast" in task_lower: return "Kokoro 82M" # Lightweight and fast else: return "VibeVoice 1.5B" # High quality # Image analysis elif ( any( keyword in task_lower for keyword in ["classify", "analyze image", "detect", "recognize"] ) and content_type == "image" ): if "nsfw" in task_lower or "safe" in task_lower: return "NSFW Image Detection" elif "emotion" in task_lower or "face" in task_lower: return "Facial Emotions Detection" elif "deepfake" in task_lower or "fake" in task_lower: return "Deepfake Detection" else: return "ViT Base Patch16" # General classification # Translation elif any( keyword in task_lower for keyword in ["translate", "language", "convert"] ): return "M2M100 1.2B" # Multilingual translation # Summarization elif any( keyword in task_lower for keyword in ["summarize", "summary", "abstract", "brief"] ): return "PEGASUS XSum" # Best summarization # Embeddings/similarity elif any( keyword in task_lower for keyword in ["similar", "embed", "vector", "search", "match"] ): return "Sentence Transformers All MiniLM" # Fast embeddings # Default fallback else: return "MiniMax-M2" # Best general-purpose model async def execute_hf_task( self, task: str, content: Any, model_name: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """ Execute any HuggingFace task with intelligent model selection Args: task: Task description (e.g., "generate image", "transcribe audio") content: Input content (text, image bytes, audio bytes) model_name: Specific model to use (optional) **kwargs: Additional parameters """ if not self.hf_tool: return {"error": "HuggingFace integration not available"} try: task_lower = task.lower() # Determine content type content_type = "text" if isinstance(content, bytes): if ( b"PNG" in content[:20] or b"JFIF" in content[:20] or b"GIF" in content[:20] ): content_type = "image" else: content_type = "audio" # Auto-select model if not specified if not model_name: model_name = await self.smart_model_selection(task, content_type) # Route to appropriate method based on task if "generate" in task_lower and ( "image" in task_lower or "picture" in task_lower ): return await self.generate_image_with_hf(content, model_name, **kwargs) elif "transcribe" in task_lower or "speech to text" in task_lower: return await self.transcribe_audio_with_hf( content, model_name, **kwargs ) elif "text to speech" in task_lower or "tts" in task_lower: return await self.synthesize_speech_with_hf( content, model_name, **kwargs ) elif "classify" in task_lower and content_type == "image": return await self.classify_image_with_hf(content, model_name, **kwargs) elif "embed" in task_lower or "vector" in task_lower: texts = [content] if isinstance(content, str) else content return await self.get_text_embeddings_with_hf(texts, model_name) elif "translate" in task_lower: return await self.translate_with_hf( content, model_name=model_name, **kwargs ) elif "summarize" in task_lower: return await self.summarize_with_hf(content, model_name, **kwargs) else: # Default to text generation return await self.generate_text_with_hf(content, model_name, **kwargs) except Exception as e: logger.error(f"HuggingFace task execution failed: {e}") return {"error": f"Task execution failed: {str(e)}"} async def chat_with_hf_models( self, message: str, conversation_history: List[Dict] = None ) -> Dict[str, Any]: """ Enhanced chat with access to HuggingFace models This method extends the base agent's capabilities with HF models """ # Check if the user is asking for HuggingFace-specific functionality message_lower = message.lower() # Handle model listing requests if "list" in message_lower and ( "model" in message_lower or "hf" in message_lower ): return self.get_available_hf_models() # Handle specific model requests hf_keywords = [ "generate image", "create image", "draw", "picture", "transcribe", "speech to text", "audio", "text to speech", "speak", "voice", "translate", "language", "classify image", "embed", "vector", "similarity", "summarize", ] if any(keyword in message_lower for keyword in hf_keywords): # This is likely a HuggingFace model request return await self.execute_hf_task(message, message) # For regular chat, we can enhance responses with HF models # First get a response from the base agent base_response = await super().chat(message, conversation_history) # Optionally enhance with HF capabilities if relevant if "image" in message_lower and "generate" in message_lower: # User might want image generation base_response["hf_suggestion"] = { "action": "generate_image", "models": ["FLUX.1 Dev", "FLUX.1 Schnell", "Stable Diffusion XL"], "message": "I can also generate images for you using HuggingFace models. Just ask!", } return base_response # New methods for expanded model categories async def generate_video_with_hf( self, prompt: str, model_name: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """Generate video from text prompt""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "Stable Video Diffusion" return await self.hf_tool.text_to_video( model_name=model_name, prompt=prompt, **kwargs ) async def generate_code_with_hf( self, prompt: str, language: str = "python", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate code from natural language description""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "CodeLlama 34B Instruct" return await self.hf_tool.code_generation( model_name=model_name, prompt=prompt, language=language, **kwargs ) async def generate_app_with_hf( self, description: str, app_type: str = "web_app", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate complete application from description""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "CodeLlama 34B Instruct" enhanced_prompt = f"Create a {app_type} application: {description}" return await self.hf_tool.code_generation( model_name=model_name, prompt=enhanced_prompt, **kwargs ) async def generate_3d_model_with_hf( self, prompt: str, model_name: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """Generate 3D model from text description""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "Shap-E" return await self.hf_tool.text_to_3d( model_name=model_name, prompt=prompt, **kwargs ) async def process_document_with_hf( self, document_data: bytes, task_type: str = "ocr", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Process documents with OCR and analysis""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} if task_type == "ocr": model_name = model_name or "TrOCR Large" return await self.hf_tool.ocr( model_name=model_name, image_data=document_data, **kwargs ) else: model_name = model_name or "LayoutLMv3" return await self.hf_tool.document_analysis( model_name=model_name, document_data=document_data, **kwargs ) async def multimodal_chat_with_hf( self, image_data: bytes, text: str, model_name: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """Chat with images using multimodal models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "BLIP-2" return await self.hf_tool.vision_language( model_name=model_name, image_data=image_data, text=text, **kwargs ) async def generate_music_with_hf( self, prompt: str, duration: int = 30, model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate music from text description""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "MusicGen" return await self.hf_tool.music_generation( model_name=model_name, prompt=prompt, duration=duration, **kwargs ) async def enhance_image_with_hf( self, image_data: bytes, task_type: str = "super_resolution", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Enhance images with various AI models""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} if task_type == "super_resolution": model_name = model_name or "Real-ESRGAN" return await self.hf_tool.super_resolution( model_name=model_name, image_data=image_data, **kwargs ) elif task_type == "background_removal": model_name = model_name or "Background Remover" return await self.hf_tool.background_removal( model_name=model_name, image_data=image_data, **kwargs ) elif task_type == "face_restoration": model_name = model_name or "GFPGAN" return await self.hf_tool.super_resolution( model_name=model_name, image_data=image_data, **kwargs ) async def generate_creative_content_with_hf( self, prompt: str, content_type: str = "story", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate creative content like stories, poems, etc.""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "GPT-3.5 Creative" enhanced_prompt = f"Write a {content_type}: {prompt}" return await self.hf_tool.creative_writing( model_name=model_name, prompt=enhanced_prompt, **kwargs ) async def generate_game_content_with_hf( self, description: str, content_type: str = "character", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate game development content""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "Character AI" enhanced_prompt = f"Create game {content_type}: {description}" return await self.hf_tool.creative_writing( model_name=model_name, prompt=enhanced_prompt, **kwargs ) async def generate_business_document_with_hf( self, context: str, document_type: str = "email", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Generate business documents and content""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "Email Assistant" return await self.hf_tool.business_document( model_name=model_name, document_type=document_type, context=context, **kwargs, ) async def research_assistance_with_hf( self, topic: str, research_type: str = "analysis", model_name: Optional[str] = None, **kwargs, ) -> Dict[str, Any]: """Research assistance and scientific content generation""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} model_name = model_name or "SciBERT" enhanced_prompt = f"Research {research_type} on: {topic}" return await self.hf_tool.text_generation( model_name=model_name, prompt=enhanced_prompt, **kwargs ) def get_available_hf_models(self, category: Optional[str] = None) -> Dict[str, Any]: """Get available models by category""" if not self.hf_tool: return {"error": "HuggingFace integration not available"} return self.hf_tool.list_available_models(category=category)