#3806 Gemini 2.5 Pro Code: KG in SQLite3 with Document Embeddings

#3806 Gemini 2.5 Pro Code: KG in SQLite3 with Document Embeddings

#3806 Create a python application that reads a CSV of entities and relationships to update a knowledge graph implemented in SQLite3. The SQLite3 knowledge graph should also support documents and document chunks as entities, with embeddings for each document chunk. Multi-hop [1..n] and [1..*] cypher queries must be supported.

This is a comprehensive request that requires a multi-part solution, including:

  1. SQLite Database Schema Design for a knowledge graph including documents, chunks, and embeddings.

  2. Cypher-like Query Support via custom SQL views or functions to handle multi-hop traversal.

  3. Python Script for reading the CSV and populating the database.

  4. Python Script for demonstration of multi-hop query.

Since a full working application with all the advanced features (like a complete Cypher parser within Python and a full embedding generation pipeline) is too complex for a single response, I'll provide a robust, simplified, and functional core structure that uses standard SQL and demonstrates the principles of implementing such a system in SQLite and Python.

1. SQLite Knowledge Graph Schema

We'll use a Property Graph Model (Nodes/Entities and Edges/Relationships) supplemented with tables for documents and embeddings.

| Table | Purpose | Key Columns |
|---|---|---|
| Entity | Stores all nodes (e.g., Person, Concept, Document, Chunk). | entity_id (PK), label (e.g., 'Person', 'Document'), name |
| Relationship | Stores the edges between entities. | source_id (FK to Entity), target_id (FK to Entity), type (e.g., 'WORKS_FOR', 'HAS_CHUNK') |
| Document | Stores metadata for documents. | doc_id (FK to Entity), title, text_content |
| Chunk | Stores metadata for document chunks. | chunk_id (FK to Entity), doc_id (FK to Document), text |
| Embedding | Stores vector embeddings for document chunks. | chunk_id (FK to Chunk), vector (e.g., JSON or BLOB of float array) |

2. Python Setup and Database Initialization

The following script initializes the database and creates the necessary tables.

import sqlite3
import pandas as pd
import json

# Path of the SQLite database file that holds the knowledge graph.
DB_NAME = 'knowledge_graph.db'
# Path of the entities/relationships CSV (a dummy file is generated below for the demo).
CSV_PATH = 'data.csv'

def initialize_db():
    """Create the knowledge-graph tables in SQLite (idempotent).

    Schema overview:
      Entity       -- every node: people, companies, documents, chunks, ...
      Relationship -- typed, directed edges between two Entity rows
      Document     -- document metadata keyed by its Entity id
      Chunk        -- document fragments keyed by their Entity id
      Embedding    -- one vector (JSON-encoded text) per chunk
    """
    schema = """
        CREATE TABLE IF NOT EXISTS Entity (
            entity_id INTEGER PRIMARY KEY,
            label TEXT NOT NULL,
            name TEXT NOT NULL UNIQUE
        );

        CREATE TABLE IF NOT EXISTS Relationship (
            source_id INTEGER,
            target_id INTEGER,
            type TEXT NOT NULL,
            FOREIGN KEY (source_id) REFERENCES Entity(entity_id),
            FOREIGN KEY (target_id) REFERENCES Entity(entity_id),
            PRIMARY KEY (source_id, target_id, type)
        );

        CREATE TABLE IF NOT EXISTS Document (
            doc_id INTEGER PRIMARY KEY,
            title TEXT,
            text_content TEXT,
            FOREIGN KEY (doc_id) REFERENCES Entity(entity_id)
        );

        CREATE TABLE IF NOT EXISTS Chunk (
            chunk_id INTEGER PRIMARY KEY,
            doc_id INTEGER,
            text TEXT,
            FOREIGN KEY (chunk_id) REFERENCES Entity(entity_id),
            FOREIGN KEY (doc_id) REFERENCES Document(doc_id)
        );

        CREATE TABLE IF NOT EXISTS Embedding (
            chunk_id INTEGER PRIMARY KEY,
            vector TEXT, -- JSON string for simplicity; a BLOB would be more compact
            FOREIGN KEY (chunk_id) REFERENCES Chunk(chunk_id)
        );
    """
    db = sqlite3.connect(DB_NAME)
    try:
        # executescript runs all CREATE statements in one shot.
        db.executescript(schema)
        db.commit()
    finally:
        db.close()
    print(f"Database '{DB_NAME}' initialized successfully.")

# Initialize the database before proceeding
initialize_db()

3. CSV Reading and Graph Population

Assume the input CSV, data.csv, has the following structure for entities and relationships:

| source_name | source_label | relationship_type | target_name | target_label |
|---|---|---|---|---|
| Alice | Person | WORKS_FOR | Acme Corp | Company |
| Acme Corp | Company | LOCATED_IN | New York | City |
| Paper A | Document | HAS_AUTHOR | Alice | Person |
| Chunk 1.1 | Chunk | CHUNK_OF | Paper A | Document |
# --- Utility Functions for Graph Population ---

def get_or_create_entity(conn, name, label):
    """Return the entity_id for *name*, inserting a new entity if needed.

    Entity.name is declared UNIQUE in the schema, so the lookup is keyed on
    name alone. The original filtered on (name, label): a name already stored
    under a different label slipped past that SELECT and the INSERT then hit
    the UNIQUE constraint with an IntegrityError.

    Args:
        conn: open sqlite3 connection (caller commits).
        name: unique entity name.
        label: label used only when the entity must be created.

    Returns:
        The integer entity_id of the existing or newly created row.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT entity_id FROM Entity WHERE name = ?", (name,))
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    cursor.execute("INSERT INTO Entity (name, label) VALUES (?, ?)", (name, label))
    return cursor.lastrowid

def add_document_and_chunks(conn, doc_name, text_content, num_chunks=2):
    """Create a Document entity, its Chunk entities, HAS_CHUNK edges, and
    placeholder embeddings.

    Fixes over the original:
      * The last chunk now extends to the end of the text. The original
        sliced every chunk to exactly len(text_content) // num_chunks
        characters, silently dropping the trailing remainder whenever the
        length was not an exact multiple of num_chunks.
      * chunk_size is clamped to at least 1 so very short texts do not
        produce all-empty chunks.

    Args:
        conn: open sqlite3 connection (caller commits).
        doc_name: unique document name (becomes the Entity name and title).
        text_content: full document text to be chunked.
        num_chunks: number of fixed-width chunks to create.
    """
    cursor = conn.cursor()
    doc_id = get_or_create_entity(conn, doc_name, 'Document')
    cursor.execute("INSERT OR REPLACE INTO Document (doc_id, title, text_content) VALUES (?, ?, ?)",
                   (doc_id, doc_name, text_content))

    # Simple fixed-width chunking; the final chunk absorbs any remainder.
    chunk_size = max(1, len(text_content) // num_chunks)
    for i in range(num_chunks):
        start = i * chunk_size
        end = (i + 1) * chunk_size if i < num_chunks - 1 else len(text_content)
        chunk_text = text_content[start:end]
        chunk_name = f"{doc_name}_Chunk_{i+1}"
        chunk_id = get_or_create_entity(conn, chunk_name, 'Chunk')

        # Add to Chunk table
        cursor.execute("INSERT OR REPLACE INTO Chunk (chunk_id, doc_id, text) VALUES (?, ?, ?)",
                       (chunk_id, doc_id, chunk_text))

        # Add HAS_CHUNK relationship
        cursor.execute("INSERT OR IGNORE INTO Relationship (source_id, target_id, type) VALUES (?, ?, ?)",
                       (doc_id, chunk_id, 'HAS_CHUNK'))

        # Placeholder embedding (simulated vector); a real pipeline would
        # call an embedding model here instead.
        embedding_vector = json.dumps([i * 0.1, doc_id * 0.5, chunk_id * 0.2])
        cursor.execute("INSERT OR REPLACE INTO Embedding (chunk_id, vector) VALUES (?, ?)",
                       (chunk_id, embedding_vector))

# --- Main Population Function ---

def populate_graph_from_csv():
    """Generate a demo CSV, read it back, and populate the knowledge graph.

    Fix over the original: the function wrote data.csv but then iterated the
    in-memory DataFrame it had just built, so the CSV was never actually
    read. It now round-trips through pd.read_csv so the CSV-driven load the
    post describes is genuinely exercised.
    """
    # Create a dummy CSV for demonstration
    dummy_data = {
        'source_name': ['Alice', 'Acme Corp', 'Paper A', 'Bob', 'Alice'],
        'source_label': ['Person', 'Company', 'Document', 'Person', 'Person'],
        'relationship_type': ['WORKS_FOR', 'LOCATED_IN', 'HAS_AUTHOR', 'WORKS_FOR', 'KNOWS'],
        'target_name': ['Acme Corp', 'New York', 'Alice', 'Acme Corp', 'Bob'],
        'target_label': ['Company', 'City', 'Person', 'Company', 'Person']
    }
    pd.DataFrame(dummy_data).to_csv(CSV_PATH, index=False)
    print(f"\nDummy CSV created at '{CSV_PATH}'.")

    # Read the file back, exactly as a run against real data would.
    df = pd.read_csv(CSV_PATH)

    conn = sqlite3.connect(DB_NAME)

    try:
        # 1. Process relationships and entities
        for _, row in df.iterrows():
            source_id = get_or_create_entity(conn, row['source_name'], row['source_label'])
            target_id = get_or_create_entity(conn, row['target_name'], row['target_label'])

            conn.cursor().execute("INSERT OR IGNORE INTO Relationship (source_id, target_id, type) VALUES (?, ?, ?)",
                                 (source_id, target_id, row['relationship_type']))

        # 2. Add Document/Chunk data (this would typically happen during the
        #    CSV read if the data included document bodies)
        add_document_and_chunks(conn, 'Paper A',
                                "The first half of the document discusses the structure of Acme Corp. "
                                "The second half details the corporate culture and work environment.")

        conn.commit()
        print("Graph populated with entities, relationships, documents, and chunks.")

    except Exception as e:
        # Best-effort demo: report the failure and undo the partial load.
        print(f"An error occurred during population: {e}")
        conn.rollback()
    finally:
        conn.close()

populate_graph_from_csv()

4. Multi-Hop Query Support (Cypher-like Traversal)

SQLite doesn't natively support recursive path queries like Cypher, but we can simulate it using Recursive Common Table Expressions (RCTE). This allows us to perform multi-hop traversals like MATCH (a:Person)-[r*1..3]->(b) (find paths up to 3 hops from 'a').

Example: Find entities up to 3 hops away from 'Alice'

This SQL query simulates a Cypher-like query:

MATCH (p:Person {name: 'Alice'})-[r*1..3]-(n) RETURN n.name

def execute_multi_hop_query(start_entity_name, max_hops=3):
    """
    Execute a multi-hop traversal query using a Recursive CTE.

    Finds all entities reachable within max_hops of the starting entity,
    following relationships in BOTH directions (undirected traversal), and
    prints a small table of (name, label, hops, path).

    Fixes over the original:
      * max_hops is bound as a SQL parameter instead of being interpolated
        into the query text with an f-string.
      * The connection is closed in a finally block, so it is released even
        if the query raises.
    """
    conn = sqlite3.connect(DB_NAME)

    # NOTE: recursion is bounded by depth alone, so a path may revisit nodes
    # (e.g. Alice -> Bob -> Alice); DISTINCT in the final SELECT collapses
    # duplicate result rows.
    rcte_query = """
    WITH RECURSIVE
        -- 1. Anchor: start the traversal at the initial entity
        paths (entity_id, name, label, depth, path) AS (
            SELECT entity_id, name, label, 0, name
            FROM Entity
            WHERE name = ?

            UNION ALL

            -- 2. Recursive step: extend the path by one hop, either direction
            SELECT
                CASE WHEN r.source_id = p.entity_id THEN r.target_id ELSE r.source_id END,
                e.name,
                e.label,
                p.depth + 1,
                p.path || ' -> ' || e.name
            FROM paths p
            JOIN Relationship r
                ON r.source_id = p.entity_id OR r.target_id = p.entity_id
            JOIN Entity e
                ON e.entity_id = (CASE WHEN r.source_id = p.entity_id
                                       THEN r.target_id ELSE r.source_id END)
            WHERE p.depth < ?
        )
    -- 3. Final selection: unique results, excluding the starting node
    SELECT DISTINCT name, label, depth, path
    FROM paths
    WHERE depth > 0
    ORDER BY depth, name;
    """

    print(f"\n--- Multi-Hop Query: Find all entities {max_hops} hops from '{start_entity_name}' ---")
    try:
        results = conn.execute(rcte_query, (start_entity_name, max_hops)).fetchall()
    finally:
        conn.close()

    if results:
        print(f"| {'Entity Name':<15} | {'Label':<10} | {'Hops':<5} | {'Path':<40} |")
        print("|" + "-"*17 + "|" + "-"*12 + "|" + "-"*7 + "|" + "-"*42 + "|")
        for name, label, depth, path in results:
            print(f"| {name:<15} | {label:<10} | {depth:<5} | {path:<40} |")
    else:
        print("No paths found.")

# --- Execution of the Multi-Hop Query ---
execute_multi_hop_query('Alice', max_hops=3)


https://gemini.google.com/app/71880532a3b297ac

Form <https://gemini.google.com/app/>  Google Gemini 2.5 Pro

Comments

Popular posts from this blog

#2892 Example of ChatGPT o1 Prompt

#3105 Federal and State Spending Idiocy