72 lines
2.1 KiB
Python
72 lines
2.1 KiB
Python
|
|
from llama_index.llms.xinference import Xinference
|
|
from typing import Any, Callable, Dict, Optional, Sequence, Tuple
|
|
from llama_index.core.llms.callbacks import (
|
|
llm_chat_callback,
|
|
llm_completion_callback,
|
|
)
|
|
from llama_index.core.base.llms.types import (
|
|
ChatMessage,
|
|
ChatResponse,
|
|
ChatResponseGen,
|
|
CompletionResponse,
|
|
CompletionResponseGen,
|
|
LLMMetadata,
|
|
MessageRole,
|
|
)
|
|
from llama_index.llms.xinference.utils import (
|
|
xinference_message_to_history,
|
|
xinference_modelname_to_contextsize,
|
|
)
|
|
|
|
class XinfengModel(Xinference):
    """Xinference LLM whose chat calls go through the ``messages=`` client API.

    Both ``chat`` and ``stream_chat`` hand the conversation to
    ``self._generator.chat`` as a list of plain dicts, together with a
    ``generate_config`` built from ``self.temperature`` and
    ``self.max_tokens``.
    """

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        """Send the conversation and return the complete assistant reply.

        Args:
            messages: Conversation history, oldest message first.
            **kwargs: Accepted for interface compatibility; not forwarded
                to the generator.

        Returns:
            A ``ChatResponse`` whose message has the assistant role and the
            content of the first choice in the reply. ``delta`` is ``None``.

        Raises:
            AssertionError: If ``self._generator`` is ``None``.
        """
        assert self._generator is not None, "Xinference generator is not initialized"
        # Convert to plain dicts exactly as stream_chat does, so both entry
        # points send the same payload shape to the generator.
        payload = [message.dict() for message in messages]
        response = self._generator.chat(
            messages=payload,
            generate_config={
                "stream": False,
                "temperature": self.temperature,
                "max_tokens": self.max_tokens,
            },
        )
        response_text = response["choices"][0]["message"]["content"]
        return ChatResponse(
            message=ChatMessage(
                role=MessageRole.ASSISTANT,
                content=response_text,
            ),
            delta=None,
        )

    @llm_chat_callback()
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        """Send the conversation and stream the assistant reply.

        The request is issued when this method is called; the returned
        generator only consumes the chunks it produces.

        Args:
            messages: Conversation history, oldest message first.
            **kwargs: Accepted for interface compatibility; not forwarded
                to the generator.

        Returns:
            A generator of ``ChatResponse`` objects. Each one carries the
            text accumulated so far as the message content and the newest
            fragment as ``delta``.

        Raises:
            AssertionError: If ``self._generator`` is ``None``.
        """
        assert self._generator is not None, "Xinference generator is not initialized"
        payload = [message.dict() for message in messages]
        response_iter = self._generator.chat(
            messages=payload,
            generate_config={
                "stream": True,
                "temperature": self.temperature,
                "max_tokens": self.max_tokens,
            },
        )

        def gen() -> ChatResponseGen:
            text = ""
            for chunk in response_iter:
                # A chunk may omit "content" or carry it as None; treat both
                # as an empty fragment so the concatenation cannot raise.
                delta = chunk["choices"][0]["delta"].get("content") or ""
                text += delta
                yield ChatResponse(
                    message=ChatMessage(
                        role=MessageRole.ASSISTANT,
                        content=text,
                    ),
                    delta=delta,
                )

        return gen()