#3310 Embedding NPR 7150.2D into an LLM to support COMPLETE and ACCURATE retrieval
#3310 I need python code to embed NASA NPR 7150.2D into an LLM. The embedding must support COMPLETE and ACCURATE retrieval of NPR 7150.2D information by section number, topic, SWE number and keyword(s).
Below is a self-contained Python example that:

- Loads and splits the NPR 7150.2D PDF into manageable chunks
- Extracts metadata for each chunk:
  - Section number (e.g. “5.2.1”)
  - Section title/topic
  - SWE references (e.g. “SWE-012”)
  - A short list of extracted keywords
- Embeds each chunk using OpenAI’s embeddings
- Stores them in a Chroma vector store with that metadata
- Provides simple retrieval functions by section number, topic, SWE number, or arbitrary keywords
```python
# 1) Install dependencies (once):
#    pip install langchain openai chromadb

import re

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


def load_and_process_pdf(pdf_path: str):
    """Load NPR 7150.2D, split into chunks, and attach metadata."""
    # --- load and split raw pages ---
    loader = PyPDFLoader(pdf_path)
    raw_docs = loader.load_and_split()

    # --- further chunk into ~1,000-char pieces with 200-char overlap ---
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.split_documents(raw_docs)

    # --- extract section/topic/SWE/keywords metadata ---
    for doc in docs:
        txt = doc.page_content

        # section number, e.g. "5.2.1", at the start of the chunk
        sec_m = re.match(r'^(\d+(?:\.\d+)+)', txt)
        section = sec_m.group(1) if sec_m else None

        # section title after the number, up to the end of the first line
        title_m = re.match(r'^\d+(?:\.\d+)+\s+(.+)$', txt.split('\n', 1)[0])
        topic = title_m.group(1).strip() if title_m else None

        # all SWE tags like [SWE-012]
        swe_list = re.findall(r'\[SWE-(\d+)\]', txt)

        # simple keyword extraction (first 10 unique words of >= 5 letters)
        keywords = []
        seen = set()
        for w in re.findall(r'\b[A-Za-z]{5,}\b', txt):
            lw = w.lower()
            if lw not in seen:
                seen.add(lw)
                keywords.append(lw)
            if len(keywords) >= 10:
                break

        doc.metadata.update({
            "section": section,
            "topic": topic,
            "swe": swe_list,
            "keywords": keywords,
        })
    return docs


def create_vectorstore(docs, persist_dir: str = "./nasa7150_vectorstore"):
    """Embed docs and write to Chroma on disk."""
    emb = OpenAIEmbeddings(model="text-embedding-ada-002")

    # Chroma only accepts scalar metadata values (str/int/float/bool),
    # so flatten the list fields to comma-separated strings and replace None.
    for doc in docs:
        md = dict(doc.metadata)
        md["section"] = md.get("section") or ""
        md["topic"] = md.get("topic") or ""
        md["swe"] = ",".join(md.get("swe") or [])
        md["keywords"] = ",".join(md.get("keywords") or [])
        doc.metadata = md

    vs = Chroma.from_documents(docs, emb, persist_directory=persist_dir)
    vs.persist()
    return vs


# --- Retrieval helpers ---

def retrieve_by_section(vs, section_number: str, k: int = 5):
    # use the section number itself as the query text (an empty query
    # cannot be embedded) and restrict results with a metadata filter
    return vs.similarity_search(section_number, filter={"section": section_number}, k=k)


def retrieve_by_topic(vs, topic: str, k: int = 5):
    return vs.similarity_search(topic, filter={"topic": topic}, k=k)


def retrieve_by_swe(vs, swe_number: str, k: int = 5):
    # "swe" is stored as a comma-separated string, so over-fetch and
    # check membership in Python rather than in the metadata filter
    hits = vs.similarity_search(f"SWE-{swe_number}", k=k * 4)
    return [d for d in hits
            if swe_number in (d.metadata.get("swe") or "").split(",")][:k]


def retrieve_by_keyword(vs, keyword: str, k: int = 5):
    return vs.similarity_search(keyword, k=k)


# --- Example usage (run once) ---
if __name__ == "__main__":
    PDF_PATH = "nasa_npr_7150_2D.pdf"  # your local copy

    docs = load_and_process_pdf(PDF_PATH)
    vs = create_vectorstore(docs)

    # Retrieve top-5 chunks from Section 5.2
    res1 = retrieve_by_section(vs, "5.2")
    print("Section 5.2 →", [d.metadata for d in res1])

    # Retrieve by SWE 012
    res2 = retrieve_by_swe(vs, "012")
    print("SWE-012 →", [d.metadata for d in res2])

    # Retrieve by topic keyphrase
    res3 = retrieve_by_topic(vs, "Software Requirements")
    print("Topic Software Requirements →", [d.metadata for d in res3])

    # Free-text keyword search
    res4 = retrieve_by_keyword(vs, "architecture")
    print("Keyword architecture →", [d.metadata for d in res4])
```
How it works

- Document loading & splitting: LangChain’s PyPDFLoader reads the PDF, and a recursive character splitter breaks it into ~1,000-character chunks (with 200-character overlaps to preserve context).
- Metadata extraction:
  - Section: regex match at the start of each chunk
  - Topic: the title immediately following the section number
  - SWE list: all occurrences of [SWE-nnn]
  - Keywords: the first ten unique words of five or more letters
- Embedding & storage:
  - OpenAI’s text-embedding-ada-002 generates 1,536-dimensional vectors
  - Chroma keeps the vectors on disk for fast similarity search with metadata-based filtering
- Retrieval:
  - By section/topic/SWE: a similarity search seeded with the tag itself, restricted by a metadata filter (plus a Python-side membership check for SWE tags)
  - By keyword: pure semantic similarity on the keyword

You can adapt chunk sizes, overlap, or keyword extraction as needed. This setup supports lookups by any of the requested axes: section number, topic name, SWE reference, or arbitrary keywords.
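To see what the metadata extraction actually produces, here is a quick standalone check of the regexes above against a made-up chunk opening (the heading text and SWE numbers are invented for illustration, not quoted from NPR 7150.2D):

```python
import re

# Hypothetical chunk opening, used only to exercise the regexes above.
sample = ("5.2.1 Software Requirements Analysis\n"
          "The project shall document software requirements [SWE-050] [SWE-051].")

first_line = sample.split("\n", 1)[0]
section = re.match(r'^(\d+(?:\.\d+)+)', sample).group(1)          # "5.2.1"
topic = re.match(r'^\d+(?:\.\d+)+\s+(.+)$', first_line).group(1)  # "Software Requirements Analysis"
swes = re.findall(r'\[SWE-(\d+)\]', sample)                       # ["050", "051"]

print(section, topic, swes)
```

Two caveats worth keeping in mind: chunks produced by a fixed-size splitter do not always begin at a heading, so section and topic can legitimately come back empty for mid-section chunks; and a k-limited similarity search by itself cannot guarantee that every chunk tagged with a given SWE number is returned. The approaches below close that second gap.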
1) Use Chroma’s metadata-only query
Chroma’s collection-level .get() call returns stored items directly, with no similarity ranking and no k limit, so you can sweep every chunk and keep the ones whose swe metadata contains the tag you want:
```python
from langchain.docstore.document import Document


def retrieve_all_by_swe(vs, swe_number: str):
    """
    Metadata-only lookup: returns every chunk whose metadata["swe"]
    contains swe_number, regardless of similarity or k.
    """
    # underlying Chroma collection; .get() does no vector ranking
    coll = vs._collection
    results = coll.get(include=["documents", "metadatas"])

    # Chroma's where filters only match scalar values, so do the
    # membership test on the comma-separated "swe" field in Python
    docs = []
    for text, meta in zip(results["documents"], results["metadatas"]):
        if swe_number in (meta.get("swe") or "").split(","):
            docs.append(Document(page_content=text, metadata=meta))
    return docs


# usage
all_swe012 = retrieve_all_by_swe(vs, "012")
print(f"Found {len(all_swe012)} chunks for SWE-012")
```
Why it works:

- The .get() call does no vector similarity ranking and ignores k; it sweeps every stored item, and the membership test keeps each one whose "swe" field includes "012".
- As long as you extracted every SWE tag into the "swe" field at index time, nothing can slip through.
2) Build an in-memory SWE→chunks index
If you’d rather have instant Python lookups (and you already have docs in memory after processing), you can build a dict:
```python
from langchain.docstore.document import Document

# after load_and_process_pdf(), before embedding
# (create_vectorstore() flattens metadata["swe"] to a string, so build this first)
swe_index: dict[str, list[Document]] = {}
for doc in docs:
    for swe in doc.metadata.get("swe", []):
        swe_index.setdefault(swe, []).append(doc)


def retrieve_all_by_swe_map(swe_number: str):
    """O(1) lookup from our precomputed map."""
    return swe_index.get(swe_number, [])


# usage
chunks = retrieve_all_by_swe_map("012")
print(f"Map lookup found {len(chunks)} chunks for SWE-012")
```
Why it works:

- You never hit the vector store for SWE lookups, so you are relying 100% on your own complete sweep of the metadata at index time (see the quick check after this list).
- Great for small-to-medium collections where keeping everything in memory is acceptable.
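A quick way to back up that completeness claim is to compare a raw regex sweep of the chunk text against what the index recorded; a minimal sketch, reusing the docs and swe_index built above:

```python
import re

# Count every [SWE-nnn] occurrence in the raw chunk text...
tags_in_text = sum(len(re.findall(r'\[SWE-(\d+)\]', d.page_content)) for d in docs)

# ...and every (chunk, tag) pair recorded in the index.
tags_in_index = sum(len(chunk_list) for chunk_list in swe_index.values())

# If the metadata sweep caught everything, the two counts match.
assert tags_in_text == tags_in_index, (tags_in_text, tags_in_index)
print(f"{tags_in_text} SWE tag occurrences, all indexed")
```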
3) Semantically rank after metadata filtering
You can even combine both approaches: first do a metadata‐only filter to get all candidates, then re-rank by similarity if you like:
```python
import numpy as np

from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings

emb = OpenAIEmbeddings(model="text-embedding-ada-002")


def retrieve_and_rank_swe(vs, swe_number: str, query: str):
    # 1) metadata-only sweep: pull text, metadata, and stored embeddings
    raw = vs._collection.get(include=["documents", "metadatas", "embeddings"])

    # 2) keep only the chunks tagged with this SWE number
    candidates = [
        (text, meta, vec)
        for text, meta, vec in zip(raw["documents"], raw["metadatas"], raw["embeddings"])
        if swe_number in (meta.get("swe") or "").split(",")
    ]

    # 3) embed the ranking query
    qv = emb.embed_query(query)

    # 4) cosine similarity against each stored embedding
    def cos(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    ranked = sorted(candidates, key=lambda c: cos(qv, c[2]), reverse=True)

    # 5) wrap back into Documents, best match first
    return [Document(page_content=text, metadata=meta) for text, meta, _ in ranked]


# usage: rank by "performance" inside the SWE-012 docs
ranked = retrieve_and_rank_swe(vs, "012", query="performance")
```
Key takeaways

- Don’t rely on k-limited similarity_search for tag lookups.
- Use metadata-only retrieval (the collection’s .get()) to fetch all matching items.
- Optionally, post-filter or rank the full set by vector similarity if you need semantic ordering.

With one of these patterns in place, you can be confident that every chunk tagged with your SWE number will be retrieved.
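As a sanity check on that point, you can contrast the two paths directly; a small sketch using the retrieve_by_swe and retrieve_all_by_swe helpers defined above:

```python
# k-limited semantic search: capped at 5 results even if more chunks exist
top_k = retrieve_by_swe(vs, "012", k=5)

# metadata-only sweep: everything tagged SWE-012, no cap
everything = retrieve_all_by_swe(vs, "012")

print(f"similarity_search returned {len(top_k)} chunks, "
      f"metadata sweep returned {len(everything)}")
```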
Below is a minimal end-to-end example showing:

- Parsing NPR 7150.2D into chunks
- Extracting metadata (section, topic, SWE, keywords)
- Indexing into several lookup tables
- Retrieval functions that return all matching chunks
```python
import re
from collections import defaultdict

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class NPRIndexer:
    def __init__(self, pdf_path: str):
        # Load & split
        loader = PyPDFLoader(pdf_path)
        raw_docs = loader.load_and_split()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        self.chunks = splitter.split_documents(raw_docs)

        # Indexes
        self.by_section = defaultdict(list)
        self.by_topic = defaultdict(list)
        self.by_swe = defaultdict(list)
        self.by_keyword = defaultdict(list)

        # Build them
        self._build_indexes()

    def _build_indexes(self):
        for chunk in self.chunks:
            text = chunk.page_content

            # 1) Section number
            m_sec = re.match(r'^(\d+(?:\.\d+)+)', text)
            section = m_sec.group(1) if m_sec else None

            # 2) Topic/title
            first_line = text.split('\n', 1)[0]
            m_title = re.match(r'^\d+(?:\.\d+)+\s+(.+)$', first_line)
            topic = m_title.group(1).strip() if m_title else None

            # 3) SWE tags
            swes = re.findall(r'\[SWE-(\d+)\]', text)

            # 4) Keywords (first 10 unique words of >= 5 letters)
            kws = re.findall(r'\b[A-Za-z]{5,}\b', text)
            seen = set()
            keywords = []
            for w in kws:
                lw = w.lower()
                if lw not in seen:
                    seen.add(lw)
                    keywords.append(lw)
                if len(keywords) >= 10:
                    break

            # Store metadata on the chunk
            chunk.metadata.update({
                'section': section,
                'topic': topic,
                'swe': swes,
                'keywords': keywords
            })

            # Populate lookup tables
            if section:
                self.by_section[section].append(chunk)
            if topic:
                self.by_topic[topic].append(chunk)
            for swe in swes:
                self.by_swe[swe].append(chunk)
            for kw in keywords:
                self.by_keyword[kw].append(chunk)

    # Retrieval methods
    def get_by_section(self, section: str):
        return self.by_section.get(section, [])

    def get_by_topic(self, topic: str):
        return self.by_topic.get(topic, [])

    def get_by_swe(self, swe: str):
        return self.by_swe.get(swe, [])

    def get_by_keyword(self, keyword: str):
        return self.by_keyword.get(keyword.lower(), [])


if __name__ == '__main__':
    INDEX = NPRIndexer('nasa_npr_7150_2D.pdf')

    # Examples:
    sec_chunks = INDEX.get_by_section('5.2')
    print(f"Section 5.2 has {len(sec_chunks)} chunks")

    swe_chunks = INDEX.get_by_swe('012')
    print(f"SWE-012 appears in {len(swe_chunks)} chunks")

    kw_chunks = INDEX.get_by_keyword('architecture')
    print(f"Keyword 'architecture' in {len(kw_chunks)} chunks")
```
Why this works

- Deterministic indexing: every chunk’s metadata is parsed exactly once and inserted into Python dicts keyed by section/SWE/etc.
- 100% coverage: since we index all chunks, a lookup like by_swe['012'] can never “miss” an entry.
- Ultra-fast retrieval: dict lookups are O(1) on average, so you pay virtually nothing at query time.
- No embeddings needed: you only use embeddings when you want semantic search (e.g. “find chunks about performance”). For exact tags/IDs, metadata alone is simpler, faster, and more reliable.

You can of course persist these dicts (via pickle or JSON) or wrap them in a tiny SQLite table if you need on-disk storage; a sketch of the pickle option follows.
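A minimal persistence sketch with pickle (the filename npr7150_index.pkl is arbitrary, and the same library versions must be importable when the stored Document objects are unpickled):

```python
import pickle

# Save the built indexer (chunks plus all four lookup tables) to disk...
with open("npr7150_index.pkl", "wb") as f:
    pickle.dump(INDEX, f)

# ...and reload it in a later session without reparsing the PDF.
with open("npr7150_index.pkl", "rb") as f:
    index = pickle.load(f)

print(f"SWE-012 appears in {len(index.get_by_swe('012'))} chunks")
```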
From <https://chatgpt.com/> ChatGPT 4o-mini-high