RAG CHATBOT WITH LANGGRAPH

Loading the PDF

from langchain_community.document_loaders import PyPDFLoader

# Point this at the PDF you want to index.
pdf_path = "/home/amit/Downloads/bk_ntgl_000033.pdf"

# Build the loader first, then read the file. The result is iterated later
# for each item's `page_content`.
loader = PyPDFLoader(pdf_path)
docs = loader.load()



Splitting the PDF into chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for 100-character chunks with no overlap between them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)

# Collect the raw text of every loaded document.
all_texts = [doc.page_content for doc in docs]

# Split document by document and accumulate all pieces in one flat list.
split_texts = []
for text in all_texts:
    chunks = text_splitter.split_text(text)
    split_texts += chunks

# Report how many chunks were produced and preview the first three.
print(f"Total chunks: {len(split_texts)}")
print(split_texts[:3])


Embeddings 

==============

import google.generativeai as genai

# 1. Configure with your Gemini API key
#
# Never paste the key into the source: anything published together with the
# script (blog post, repository, notebook) exposes it. Read it from the
# environment instead, e.g. run `export GOOGLE_API_KEY=...` beforehand.
# Any key that was ever hard-coded here should be treated as leaked and
# revoked.
import os

_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=_api_key)

# 2. Choose Gemini embedding model
MODEL = "models/text-embedding-004"  # embedding model used for every request

# 4. Function to embed a list of chunks
def embed_chunks(chunks, *, task_type="semantic_similarity"):
    """Return one Gemini embedding per text chunk.

    Each chunk is sent in its own ``genai.embed_content`` request using the
    module-level ``MODEL``.

    Args:
        chunks: Iterable of strings to embed.
        task_type: Task hint forwarded to ``genai.embed_content``. Defaults
            to ``"semantic_similarity"`` so existing callers behave exactly
            as before. For a retrieval (RAG) index the Gemini API documents
            ``"retrieval_document"`` for stored chunks and
            ``"retrieval_query"`` for the user's question.

    Returns:
        A list holding the ``"embedding"`` entry of every response, in the
        same order as ``chunks``. Empty input yields an empty list.
    """
    return [
        genai.embed_content(
            model=MODEL,
            content=chunk,
            task_type=task_type,
        )["embedding"]
        for chunk in chunks
    ]

# 5. Run embeddings
# Embed every chunk of the whole PDF. `chunks` is only the leftover loop
# variable from the splitting step (the pieces of the last page processed);
# the complete list lives in `split_texts`.
embeddings = embed_chunks(split_texts)

# 6. Show results
print("Total chunks:", len(embeddings))

if embeddings:
    print("Embedding dimension:", len(embeddings[0]))
    print("\nFirst chunk embedding preview:")
    print(embeddings[0][:10])   # show first 10 numbers
else:
    # Nothing was embedded (e.g. the PDF produced no text), so there is no
    # first vector to inspect; say so instead of failing on embeddings[0].
    print("No chunks were embedded; check that the PDF contains extractable text.")




Comments

Popular posts from this blog

Pyramid Model lifestyle for Success

Future Plan-No dream is too big

Mine anger reason