OiO.lk Community platform!

Oio.lk is an excellent forum for developers, providing a wide range of resources, discussions, and support for those in the developer community. Join oio.lk today to connect with like-minded professionals, share insights, and stay updated on the latest trends and technologies in the development field.
  You need to log in or register to access the solved answers to this problem.
  • You have reached the maximum number of guest views allowed
  • Please register below to remove this limitation

How to retrieve relevant Document of answer which was given by llm using Llamaindex?

  • Thread starter Thread starter JohnB
  • Start date Start date
J

JohnB

Guest
We are using pdf to load into our vector db and I am able to get answer from the llm but we have a requirement that we need to have the page content from the pdf.

Code:
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
from llama_index.core import Settings
import os
import json

query = 'What is the topic of the document?'  # Hardcoded query
apiKey = 'xxx'  # NOTE(review): load secrets from the environment (os.environ) rather than hardcoding
apiVersion = '2023-07-01'  # Set the appropriate API version
azure_endpoint = 'xxx'  # Your Azure OpenAI endpoint

# LLM used to synthesize the final answer from the retrieved chunks.
llm = AzureOpenAI(
    model="xxx",
    deployment_name="xxx",
    api_key=apiKey,
    azure_endpoint=azure_endpoint,
    api_version=apiVersion,
)

# Embedding model used to vectorize both the document chunks and the query.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="Embedding",
    api_key=apiKey,
    azure_endpoint=azure_endpoint,
    api_version=apiVersion,
)

Settings.llm = llm
Settings.embed_model = embed_model


# Load the PDF. PyMuPDFReader returns one Document per page, each carrying
# metadata (file path, page number). Keep them SEPARATE — the original code
# joined every page into a single Document, which threw away exactly the
# per-page provenance the requirement asks for.
docs = PyMuPDFReader().load(file_path=Path("sample2.pdf"))

# Split into ~1024-token chunks; page metadata propagates onto each node.
node_parser = SentenceSplitter(chunk_size=1024)
base_nodes = node_parser.get_nodes_from_documents(docs)

# Build the vector index. embed_model MUST be passed by keyword: the second
# positional parameter of VectorStoreIndex is not the embedding model, so the
# original positional call bound it to the wrong parameter.
index = VectorStoreIndex(base_nodes, embed_model=embed_model)

# LlamaIndex query engines take similarity_top_k directly. The original
# called as_retriever(search_type=..., search_kwargs={"k": 2}) — those are
# LangChain arguments that LlamaIndex does not understand — and then never
# used the retriever at all.
query_engine = index.as_query_engine(similarity_top_k=2)

# Run the query. The Response object holds both the synthesized answer and
# response.source_nodes: the retrieved chunks with their text and metadata —
# this is the "page content from the pdf" the requirement needs.
response = query_engine.query(query)

output = {
    "answer": str(response),
    "context": [
        {
            # Metadata keys come from PyMuPDF; "file_path"/"source" are the
            # usual ones — confirm against node.metadata for your reader version.
            "name": node.metadata.get("file_path", ""),
            "page": node.metadata.get("source", node.metadata.get("page_label", "")),
            "pageContent": node.get_content(),
        }
        for node in response.source_nodes
    ],
}

print(json.dumps(output, indent=4, ensure_ascii=False))

We need a result like this:

Code:
{
    "answer": " Parkinson's disease",
    "context": [
        {
            "name": "qna1718774871.255651\\10.1038_s41531-018-0058-0.pdf",
            "page": 6,
            "pageContent": "ADDITIONAL INFORMATION\nSupplementary information accompanies the paper on the npj Parkinson ’s..."
        }
    ],
    "status": [
        {
            "paper_id": "10.1038/s41531-018-0058-0",
            "status": "Success"
        }
    ]
}

I am using Python, so I want to know whether there's a library or technique that can be used to get this kind of result.
<p>We are using pdf to load into our vector db and I am able to get answer from the llm but we have a requirement that we need to have the page content from the pdf.</p>
<pre><code>from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
from llama_index.core import Settings
import os
import json
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

query = 'What is the topic of the document?' # Hardcoded query
apiKey = 'xxx' # Hardcoded Azure OpenAI API key
apiVersion = '2023-07-01' # Set the appropriate API version
azure_endpoint = 'xxx' # Your Azure OpenAI endpoint

llm = AzureOpenAI(
model="xxx",
deployment_name="xxx",
api_key=apiKey,
azure_endpoint=azure_endpoint,
api_version=apiVersion,
)

embed_model = AzureOpenAIEmbedding(
model="text-embedding-ada-002",
deployment_name="Embedding",
api_key=apiKey,
azure_endpoint=azure_endpoint,
api_version=apiVersion,
)
# Define the LLM

Settings.llm = llm
Settings.embed_model = embed_model


# Load documents using PyMuPDFReader
docs0 = PyMuPDFReader().load(file_path=Path("sample2.pdf")) # Replace with the correct path if downloading is needed
doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]

# Split documents into chunks
node_parser = SentenceSplitter(chunk_size=1024)
base_nodes = node_parser.get_nodes_from_documents(docs)

# Create the embedding function using Azure OpenAI

# Create the vector store index
index = VectorStoreIndex(base_nodes, embed_model)
retriever = index.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. You always HAVE TO say "thanks for asking!" at the end of the answer!
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# # Create a query engine
query_engine = index.as_query_engine()

# Get the answer
result = query_engine.query(query)

print(result)
</code></pre>
<p>We need a result like this:</p>
<pre><code>{
"answer": " Parkinson's disease",
"context": [
{
"name": "qna1718774871.255651\\10.1038_s41531-018-0058-0.pdf",
"page": 6,
"pageContent": "ADDITIONAL INFORMATION\nSupplementary information accompanies the paper on the npj Parkinson ’s..."
},
],
"status": [
{
"paper_id": "10.1038/s41531-018-0058-0",
"status": "Success"
}
]
}
</code></pre>
<p>I am using Python, so want to know if there's any library or anything which can be used to get this kind of a result.</p>
 
Top