RAG-CHATBOT WITH LANGGRAPH
Loading the PDF
from langchain_community.document_loaders import PyPDFLoader
# Location of the PDF to index -- replace with the path to your own file.
pdf_path = "/home/amit/Downloads/bk_ntgl_000033.pdf"

# Build the loader, then read the file into `docs`; each entry exposes its
# text through `.page_content`, which the splitting step reads.
loader = PyPDFLoader(pdf_path)
docs = loader.load()
Splitting the PDF text into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=100 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=100)

# Collect the raw text of every loaded document.
all_texts = [document.page_content for document in docs]

# Split each document's text on its own and gather the pieces, in order,
# into one flat list.
split_texts = []
for text in all_texts:
    chunks = text_splitter.split_text(text)
    split_texts += chunks

print(f"Total chunks: {len(split_texts)}")
print(split_texts[:3])  # preview first 3 chunks
Embeddings
==============
import google.generativeai as genai
# 1. Configure with your Gemini API key.
# The key is read from the environment instead of being written into the
# source, so it is never committed or published along with the code.
# SECURITY: any key that was previously hard-coded here should be treated as
# leaked and revoked/rotated.
import os

api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=api_key)

# 2. Choose Gemini embedding model (used by embed_chunks for every request).
MODEL = "models/text-embedding-004"
# 4. Function to embed a list of chunks
def embed_chunks(chunks, *, task_type="semantic_similarity"):
    """Embed every text chunk with the module-level ``MODEL``.

    One ``genai.embed_content`` request is issued per chunk, in order.

    Args:
        chunks: Iterable of text strings to embed.
        task_type: Task type forwarded to ``genai.embed_content``. Defaults
            to ``"semantic_similarity"`` (the value this function always
            used); pass a different value to embed for another purpose.

    Returns:
        A list with one entry per chunk, each being the ``"embedding"``
        field of the corresponding response. Empty when ``chunks`` is empty.
    """
    return [
        genai.embed_content(
            model=MODEL,
            content=chunk,
            task_type=task_type,
        )["embedding"]
        for chunk in chunks
    ]
# 5. Run embeddings over every chunk of the document.
# `split_texts` is the full list accumulated across all documents; `chunks`
# is only the splitting loop's variable and holds just the pieces of the last
# document processed, so passing it would embed a fraction of the PDF.
embeddings = embed_chunks(split_texts)

# 6. Show results
print("Total chunks:", len(embeddings))
if embeddings:
    # Only preview when something was embedded; indexing [0] on an empty
    # result would raise IndexError.
    print("Embedding dimension:", len(embeddings[0]))
    print("\nFirst chunk embedding preview:")
    print(embeddings[0][:10])  # show first 10 numbers
Comments
Post a Comment