Machine learning: Mistral Dolphin conversational bot with article ingestion


Intro

This article walks through a Python application that combines several libraries and APIs to build an AI-based conversational agent with retrieval over ingested articles. Each section of the code is explained first, followed by the corresponding code snippet.

1. Importing Libraries

The first block of code imports all necessary libraries and modules. Pydantic is used for data validation, Torch for running the deep learning model, and Gradio for creating web UIs for Python apps. Uvicorn serves as the ASGI server for the FastAPI application, which provides the web service framework. LangChain supplies the prompts, embeddings, vector store, and retrieval chains, while Transformers provides the tokenizer, the model, and the text-generation pipeline.

from pydantic import BaseModel
import torch
import gradio as gr
import uvicorn
from textwrap import fill
from IPython.display import Markdown, display

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from fastapi import FastAPI, HTTPException
from langchain_community.vectorstores import Chroma
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

import warnings

2. Data Model Definition

Defines a PromptRequest class using Pydantic to validate incoming requests. The class expects a single prompt field of type string.

class PromptRequest(BaseModel):
    prompt: str
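
For illustration, here is a quick, hypothetical check of how this model behaves (not part of the application code): a valid payload is accepted, while a payload missing the prompt field raises a validation error.

# Illustrative check of the request model; example values only.
valid = PromptRequest(prompt="Who are you?")
print(valid.prompt)  # "Who are you?"

try:
    PromptRequest()  # missing the required "prompt" field
except Exception as err:
    print(err)  # Pydantic reports a validation error for the missing field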

3. Warning Configuration and FastAPI App Initialization

Suppresses warnings and initializes a FastAPI application instance.

warnings.filterwarnings('ignore')
app = FastAPI()

4. Model Configuration and Initialization

The following lines set up the tokenizer and model for text generation using the pre-trained model "cognitivecomputations/dolphin-2.6-mistral-7b". The weights are loaded in float16 to reduce memory usage, and device_map="auto" places them on the available hardware, which makes the setup suitable for various devices including the Apple M1 chip. The generation configuration caps the output at 1024 new tokens and uses a very low temperature with a repetition penalty to keep responses focused.

MODEL_NAME = "cognitivecomputations/dolphin-2.6-mistral-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16,
    trust_remote_code=True, device_map="auto"
)

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15

pipe_line = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)
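
Before wiring the pipeline into LangChain, it can help to confirm that generation works on its own. The following is a minimal, illustrative smoke test (the prompt text is just an example); output will vary with hardware and model version.

# Illustrative smoke test of the raw text-generation pipeline.
sample = pipe_line("Explain retrieval-augmented generation in one sentence.")
print(sample[0]["generated_text"])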

5. Setting Up Additional AI Components

This section wires the text-generation pipeline into LangChain as an LLM, loads a sentence-embedding model (thenlper/gte-large), fetches and splits the listed articles into overlapping chunks, and indexes them in a Chroma vector store, preparing the system for retrieval-based question answering.

llm = HuggingFacePipeline(
    pipeline=pipe_line,
)

embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)
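
As a quick sanity check (not part of the original script), the embedder can be queried directly; gte-large returns a fixed-length dense vector per input, which is what Chroma will index.

# Illustrative check: embed a single query string and inspect its size.
vector = embeddings.embed_query("What does Davide Pugliese do?")
print(len(vector))  # dimensionality of the gte-large embedding space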

urls = [
    "https://davidepuglie.se/about/",
    "https://davidepuglie.se/blog/machine-learning-jupyter-docker-container"
]

loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()

len(documents)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024, chunk_overlap=64)
texts_chunks = text_splitter.split_documents(documents)

len(texts_chunks)
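
To see what the splitter produces, one can inspect a chunk and its source metadata; this is an illustrative check rather than part of the service.

# Illustrative inspection of the first chunk produced by the splitter.
if texts_chunks:
    print(texts_chunks[0].page_content[:200])  # first 200 characters of the chunk
    print(texts_chunks[0].metadata)            # e.g. the source URL of the chunk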

db = Chroma.from_documents(texts_chunks, embeddings, persist_directory="db")
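
With the vector store built, retrieval can be exercised directly before involving the LLM. The query below is only an example.

# Illustrative retrieval check: fetch the two most similar chunks.
hits = db.similarity_search("What is this blog about?", k=2)
for doc in hits:
    print(doc.metadata.get("source"), "->", doc.page_content[:100])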

6. Conversation Management and Retrieval Chain

Defines a custom condense-question template and sets up a ConversationalRetrievalChain from the previously initialized components. The chain stores the conversation history in memory, rewrites follow-up questions into standalone queries using the custom template, and retrieves the most relevant chunks from the vector store to produce context-aware answers.

custom_template = """You are Davide Pugliese's Manager AI Assistant. Consider only the most relevant details from the following conversation to answer the standalone question. 
If the details provided do not give enough information to formulate a complete answer, respond with 'I am

 sorry, I don't have enough information'. Please answer in English.

Chat History:
{chat_history}

Standalone Question:
{question}
Please provide a concise and focused response based on the above details.
"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
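
The template has two input variables, chat_history and question, which PromptTemplate infers from the braces. A hypothetical rendering with example values shows what the question-condensing step actually receives.

# Illustrative rendering of the condense-question prompt (example values).
rendered = CUSTOM_QUESTION_PROMPT.format(
    chat_history="Human: Hi\nAI: Hello, how can I help?",
    question="What does Davide do for a living?",
)
print(rendered)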

memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    memory=memory,
    condense_question_prompt=CUSTOM_QUESTION_PROMPT,
)
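
The chain can also be invoked directly, outside the API, which is handy when iterating on the prompt or retrieval settings. The question below is only an example.

# Illustrative direct invocation; the memory object records the exchange.
result = qa_chain({"question": "Who is Davide Pugliese?"})
print(result["answer"])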

7. API Endpoints

Defines two API endpoints. The root endpoint simply returns a greeting, while the /generate-text endpoint runs the user's prompt through the conversational chain and returns the portion of the answer that begins with "Helpful Answer:".

@app.get("/")
async def read_root():
    return {"Hello": "World"}

@app.post("/generate-text")
async def generate_text(request: PromptRequest):
    try:
        response_ = qa_chain({"question": request.prompt})
        response = response_["answer"].strip()

        # Keep only the text from "Helpful Answer:" onward; fall back to the
        # full answer if the marker is not present.
        start_index = response.find("Helpful Answer:")
        extracted_text = response[start_index:] if start_index != -1 else response

        return {"response": extracted_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
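
For local testing without starting a server, FastAPI's TestClient can drive the endpoint in-process. This is a minimal sketch, not part of the original script, and it assumes the httpx dependency required by TestClient is installed.

# Illustrative in-process test of the /generate-text endpoint.
from fastapi.testclient import TestClient

client = TestClient(app)
resp = client.post("/generate-text", json={"prompt": "Who are you?"})
print(resp.status_code, resp.json())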

8. Running the Application

Finally, the script runs the application on a local server using Uvicorn, specifying the module and application callable ("rag:app"), the host and port to bind to, and enabling auto-reload for development.

if __name__ == "__main__":
    uvicorn.run("rag:app", host="0.0.0.0", port=8085, reload=True)

This completes the code walkthrough. Together, these components form a retrieval-augmented conversational agent capable of answering questions grounded in the ingested articles.

We can test the API with

curl localhost:8085/generate-text -H "Content-Type: application/json"  -X POST -d '{"prompt": "Who are you?"}'     

This will produce a response similar to the following:

{"response":"Helpful Answer: I am a passionate hands-on software architect who creates bespoke solutions to empower businesses."} 

Here is the full code.

from pydantic import BaseModel
import torch
import gradio as gr
import uvicorn
from textwrap import fill
from IPython.display import Markdown, display

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from fastapi import FastAPI, HTTPException
from langchain_community.vectorstores import Chroma
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

import warnings


class PromptRequest(BaseModel):
    prompt: str


warnings.filterwarnings('ignore')

app = FastAPI()


# Adjusted model loading without BitsAndBytesConfig for M1 compatibility.
MODEL_NAME = "cognitivecomputations/dolphin-2.6-mistral-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    # Kept torch_dtype for reduced model size.
    MODEL_NAME, torch_dtype=torch.float16,
    # Auto device mapping suitable for M1.
    trust_remote_code=True, device_map="auto"
)


generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15

pipe_line = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(
    pipeline=pipe_line,
)

embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)


urls = [
    "https://davidepuglie.se/about/",
    "https://davidepuglie.se/blog/machine-learning-jupyter-docker-container"
]

loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()

len(documents)


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024, chunk_overlap=64)
texts_chunks = text_splitter.split_documents(documents)

len(texts_chunks)

db = Chroma.from_documents(texts_chunks, embeddings, persist_directory="db")

custom_template = """You are Davide Pugliese's Manager AI Assistant. Consider only the most relevant details from the following conversation to answer the standalone question. 
If the details provided do not give enough information to formulate a complete answer, respond with 'I am sorry, I don't have enough information'. Please answer in English.

Chat History:
{chat_history}

Standalone Question:
{question}
Please provide a concise and focused response based on the above details.
"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    memory=memory,
    condense_question_prompt=CUSTOM_QUESTION_PROMPT,
)


@app.get("/")
async def read_root():
    return {"Hello": "World"}


@app.post("/generate-text")
async def generate_text(request: PromptRequest):
    try:
        # Run the user's prompt through the preloaded conversational retrieval chain
        response_ = qa_chain({"question": request.prompt})
        response = response_["answer"].strip()
        # Extract everything from "Helpful Answer:" to the end of the text

        start_index = response.find("Helpful Answer:")
        extracted_text = response[start_index:] if start_index != -1 else response

        return {"response": extracted_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run("rag:app", host="0.0.0.0", port=8085, reload=True)