from llama_index.llms.xinference import Xinference
from typing import Any, Callable, Dict, Optional, Sequence, Tuple
from llama_index.core.llms.callbacks import (
    llm_chat_callback,
    llm_completion_callback,
)
from llama_index.core.base.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
    MessageRole,
)
from llama_index.llms.xinference.utils import (
    xinference_message_to_history,
    xinference_modelname_to_contextsize,
)


def _messages_to_payload(messages: Sequence[ChatMessage]) -> list:
    """Serialize chat messages into the list of dicts passed as ``messages``."""
    return [message.dict() for message in messages]


def _delta_text(chunk: Dict[str, Any]) -> str:
    """Return the text carried by one streamed chunk, or ``""`` if it has none.

    A chunk may omit ``content`` or carry it with a null value; both cases are
    mapped to an empty string so the caller can concatenate without a
    ``TypeError``.
    """
    return chunk["choices"][0]["delta"].get("content") or ""


class XinfengModel(Xinference):
    """Xinference LLM that talks to the generator through ``chat(messages=...)``.

    Every method forwards ``self.temperature`` and ``self.max_tokens`` in the
    ``generate_config`` and reads the reply from ``choices[0]``. Extra
    ``**kwargs`` are accepted for interface compatibility but are not
    forwarded to the generator.
    """

    def _generate_config(self, stream: bool) -> Dict[str, Any]:
        """Build the ``generate_config`` dict shared by all four entry points."""
        return {
            "stream": stream,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        """Send ``messages`` and return the full assistant reply.

        Args:
            messages: Conversation to send; each message is serialized with
                ``ChatMessage.dict()``.
            **kwargs: Ignored.

        Returns:
            A ``ChatResponse`` whose message has the assistant role and the
            content of ``choices[0]["message"]["content"]``; ``delta`` is
            ``None``.
        """
        assert self._generator is not None
        response_text = self._generator.chat(
            messages=_messages_to_payload(messages),
            generate_config=self._generate_config(stream=False),
        )["choices"][0]["message"]["content"]
        return ChatResponse(
            message=ChatMessage(
                role=MessageRole.ASSISTANT,
                content=response_text,
            ),
            delta=None,
        )

    @llm_chat_callback()
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        """Send ``messages`` and stream the assistant reply.

        The request is issued when this method is called, not when the
        returned generator is first iterated.

        Args:
            messages: Conversation to send; each message is serialized with
                ``ChatMessage.dict()``.
            **kwargs: Ignored.

        Returns:
            A generator yielding one ``ChatResponse`` per streamed chunk.
            ``delta`` is the chunk's text and the message content is the text
            accumulated so far.
        """
        msgs = _messages_to_payload(messages)
        assert self._generator is not None
        response_iter = self._generator.chat(
            messages=msgs,
            generate_config=self._generate_config(stream=True),
        )

        def gen() -> ChatResponseGen:
            text = ""
            for chunk in response_iter:
                delta = _delta_text(chunk)
                text += delta
                yield ChatResponse(
                    message=ChatMessage(
                        role=MessageRole.ASSISTANT,
                        content=text,
                    ),
                    delta=delta,
                )

        return gen()

    @llm_completion_callback()
    def complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        """Complete ``prompt`` by sending it as a single chat message.

        Args:
            prompt: Text to send.
            formatted: Accepted for interface compatibility; not used here.
            **kwargs: Ignored.

        Returns:
            A ``CompletionResponse`` whose ``text`` is
            ``choices[0]["message"]["content"]``; ``delta`` is ``None``.
        """
        assert self._generator is not None
        # NOTE(review): the prompt is sent with the SYSTEM role and no user
        # turn follows it. Kept as written; confirm this is intended for the
        # served model rather than MessageRole.USER.
        message = ChatMessage.from_str(prompt, MessageRole.SYSTEM)
        response_text = self._generator.chat(
            messages=[message.dict()],
            generate_config=self._generate_config(stream=False),
        )["choices"][0]["message"]["content"]
        return CompletionResponse(
            delta=None,
            text=response_text,
        )

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        """Complete ``prompt`` as a single chat message and stream the reply.

        The request is issued when this method is called, not when the
        returned generator is first iterated.

        Args:
            prompt: Text to send.
            formatted: Accepted for interface compatibility; not used here.
            **kwargs: Ignored.

        Returns:
            A generator yielding one ``CompletionResponse`` per streamed
            chunk. ``delta`` is the chunk's text and ``text`` is the text
            accumulated so far.
        """
        assert self._generator is not None
        # NOTE(review): the prompt is sent with the SYSTEM role and no user
        # turn follows it. Kept as written; confirm this is intended for the
        # served model rather than MessageRole.USER.
        message = ChatMessage.from_str(prompt, MessageRole.SYSTEM)
        response_iter = self._generator.chat(
            messages=[message.dict()],
            generate_config=self._generate_config(stream=True),
        )

        def gen() -> CompletionResponseGen:
            text = ""
            for chunk in response_iter:
                delta = _delta_text(chunk)
                text += delta
                yield CompletionResponse(
                    delta=delta,
                    text=text,
                )

        return gen()