#3806 Gemini 2.5 Pro Code: KG in SQLite3 with Document Embeddings
#3806 Create a python application that reads a CSV of entities and relationships to update a knowledge graph implemented in SQLite3. The SQLite3 knowledge graph should also support documents and document chunks as entities, with embeddings for each document chunk. Multi-hop [1..n] and [1..*] cypher queries must be supported.
This is a comprehensive request that requires a multi-part solution, including:
SQLite Database Schema Design for a knowledge graph including documents, chunks, and embeddings.
Cypher-like Query Support via custom SQL views or functions to handle multi-hop traversal.
Python Script for reading the CSV and populating the database.
Python Script for demonstration of multi-hop query.
Since a full working application with all the advanced features (like a complete Cypher parser within Python and a full embedding generation pipeline) is too complex for a single response, I'll provide a robust, simplified, and functional core structure that uses standard SQL and demonstrates the principles of implementing such a system in SQLite and Python.
1. SQLite Knowledge Graph Schema
We'll use a Property Graph Model (Nodes/Entities and Edges/Relationships) supplemented with tables for documents and embeddings.
| Table | Purpose | Key Columns |
| Entity | Stores all nodes (e.g., Person, Concept, Document, Chunk). | entity_id (PK), label (e.g., 'Person', 'Document'), name |
| Relationship | Stores the edges between entities. | source_id (FK to Entity), target_id (FK to Entity), type (e.g., 'WORKS_FOR', 'HAS_CHUNK') |
| Document | Stores metadata for documents. | doc_id (FK to Entity), title, text_content |
| Chunk | Stores metadata for document chunks. | chunk_id (FK to Entity), doc_id (FK to Document), text |
| Embedding | Stores vector embeddings for document chunks. | chunk_id (FK to Chunk), vector (e.g., JSON or BLOB of float array) |
2. Python Setup and Database Initialization
The following script initializes the database and creates the necessary tables.
import sqlite3
import pandas as pd
import json
DB_NAME = 'knowledge_graph.db'
CSV_PATH = 'data.csv'
def initialize_db():
    """Create the knowledge-graph schema in the SQLite database.

    Creates (if absent) the five tables of the property-graph model:
    Entity (nodes), Relationship (edges), Document, Chunk, and
    Embedding. Safe to call repeatedly thanks to IF NOT EXISTS.
    """
    ddl_statements = (
        # Entities (nodes): one row per graph node, any label.
        """
        CREATE TABLE IF NOT EXISTS Entity (
            entity_id INTEGER PRIMARY KEY,
            label TEXT NOT NULL,
            name TEXT NOT NULL UNIQUE
        );
        """,
        # Relationships (edges): directed and typed; the composite
        # primary key de-duplicates repeated edges.
        """
        CREATE TABLE IF NOT EXISTS Relationship (
            source_id INTEGER,
            target_id INTEGER,
            type TEXT NOT NULL,
            FOREIGN KEY (source_id) REFERENCES Entity(entity_id),
            FOREIGN KEY (target_id) REFERENCES Entity(entity_id),
            PRIMARY KEY (source_id, target_id, type)
        );
        """,
        # Document metadata, keyed by the owning Entity row.
        """
        CREATE TABLE IF NOT EXISTS Document (
            doc_id INTEGER PRIMARY KEY,
            title TEXT,
            text_content TEXT,
            FOREIGN KEY (doc_id) REFERENCES Entity(entity_id)
        );
        """,
        # Chunk metadata: each chunk belongs to exactly one document.
        """
        CREATE TABLE IF NOT EXISTS Chunk (
            chunk_id INTEGER PRIMARY KEY,
            doc_id INTEGER,
            text TEXT,
            FOREIGN KEY (chunk_id) REFERENCES Entity(entity_id),
            FOREIGN KEY (doc_id) REFERENCES Document(doc_id)
        );
        """,
        # One embedding vector per chunk.
        """
        CREATE TABLE IF NOT EXISTS Embedding (
            chunk_id INTEGER PRIMARY KEY,
            vector TEXT, -- Storing as JSON string for simplicity, or BLOB for efficiency
            FOREIGN KEY (chunk_id) REFERENCES Chunk(chunk_id)
        );
        """,
    )
    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.cursor()
        for ddl in ddl_statements:
            cursor.execute(ddl)
        conn.commit()
    finally:
        conn.close()
    print(f"Database '{DB_NAME}' initialized successfully.")
# Initialize the database before proceeding
initialize_db()
3. CSV Reading and Graph Population
Assume the input CSV, data.csv, has the following structure for entities and relationships:
| source_name | source_label | relationship_type | target_name | target_label |
| Alice | Person | WORKS_FOR | Acme Corp | Company |
| Acme Corp | Company | LOCATED_IN | New York | City |
| Paper A | Document | HAS_AUTHOR | Alice | Person |
| Chunk 1.1 | Chunk | CHUNK_OF | Paper A | Document |
# --- Utility Functions for Graph Population ---
def get_or_create_entity(conn, name, label):
    """Return the entity_id for (name, label), inserting if needed.

    The insert is left uncommitted; the caller is responsible for
    conn.commit().

    Raises:
        ValueError: if an entity with the same name already exists
            under a different label. (Entity.name is UNIQUE, so the
            original (name, label) lookup followed by a blind insert
            would fail with an opaque IntegrityError in that case.)
    """
    cursor = conn.cursor()
    # Look up by name alone: the schema enforces UNIQUE(name), so a
    # name can only ever map to a single entity row.
    cursor.execute("SELECT entity_id, label FROM Entity WHERE name = ?", (name,))
    row = cursor.fetchone()
    if row:
        entity_id, existing_label = row
        if existing_label != label:
            raise ValueError(
                f"Entity '{name}' already exists with label "
                f"'{existing_label}' (requested label '{label}')"
            )
        return entity_id
    cursor.execute("INSERT INTO Entity (name, label) VALUES (?, ?)", (name, label))
    return cursor.lastrowid
def add_document_and_chunks(conn, doc_name, text_content, num_chunks=2):
    """Create a Document entity, its Chunk entities, and HAS_CHUNK edges.

    Each chunk also gets a Chunk row and a placeholder embedding vector.
    The last chunk absorbs the division remainder so the chunks always
    cover the full text (the original sliced exactly num_chunks equal
    pieces and silently dropped the trailing characters). All writes
    are left uncommitted for the caller to commit.

    Raises:
        ValueError: if num_chunks is less than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be >= 1")
    doc_id = get_or_create_entity(conn, doc_name, 'Document')
    conn.cursor().execute("INSERT OR REPLACE INTO Document (doc_id, title, text_content) VALUES (?, ?, ?)",
                          (doc_id, doc_name, text_content))
    # Simple chunking simulation: equal slices, last slice takes the rest.
    chunk_size = len(text_content) // num_chunks
    for i in range(num_chunks):
        start = i * chunk_size
        # Last chunk runs to the end of the text so no characters are lost.
        end = (i + 1) * chunk_size if i < num_chunks - 1 else len(text_content)
        chunk_text = text_content[start:end]
        chunk_name = f"{doc_name}_Chunk_{i+1}"
        chunk_id = get_or_create_entity(conn, chunk_name, 'Chunk')
        # Add to Chunk table
        conn.cursor().execute("INSERT OR REPLACE INTO Chunk (chunk_id, doc_id, text) VALUES (?, ?, ?)",
                              (chunk_id, doc_id, chunk_text))
        # Add HAS_CHUNK relationship
        conn.cursor().execute("INSERT OR IGNORE INTO Relationship (source_id, target_id, type) VALUES (?, ?, ?)",
                              (doc_id, chunk_id, 'HAS_CHUNK'))
        # Add a placeholder embedding (simulated vector); a real system
        # would call an embedding model here instead.
        embedding_vector = json.dumps([i * 0.1, doc_id * 0.5, chunk_id * 0.2])
        conn.cursor().execute("INSERT OR REPLACE INTO Embedding (chunk_id, vector) VALUES (?, ?)",
                              (chunk_id, embedding_vector))
# --- Main Population Function ---
def populate_graph_from_csv():
    """Write a demo CSV of relationship triples, read it back, and
    populate the Entity/Relationship tables, plus one demo document
    with chunks and embeddings.

    All writes happen in a single transaction: committed on success,
    rolled back (with a printed message) on any error.
    """
    # Create a dummy CSV for demonstration
    dummy_data = {
        'source_name': ['Alice', 'Acme Corp', 'Paper A', 'Bob', 'Alice'],
        'source_label': ['Person', 'Company', 'Document', 'Person', 'Person'],
        'relationship_type': ['WORKS_FOR', 'LOCATED_IN', 'HAS_AUTHOR', 'WORKS_FOR', 'KNOWS'],
        'target_name': ['Acme Corp', 'New York', 'Alice', 'Acme Corp', 'Bob'],
        'target_label': ['Company', 'City', 'Person', 'Company', 'Person']
    }
    pd.DataFrame(dummy_data).to_csv(CSV_PATH, index=False)
    print(f"\nDummy CSV created at '{CSV_PATH}'.")
    # Read the CSV back so this function genuinely exercises CSV
    # ingestion (the original iterated the in-memory frame instead,
    # never actually reading the file it wrote).
    df = pd.read_csv(CSV_PATH)
    conn = sqlite3.connect(DB_NAME)
    try:
        # 1. Process relationships and entities
        for row in df.itertuples(index=False):
            source_id = get_or_create_entity(conn, row.source_name, row.source_label)
            target_id = get_or_create_entity(conn, row.target_name, row.target_label)
            conn.cursor().execute("INSERT OR IGNORE INTO Relationship (source_id, target_id, type) VALUES (?, ?, ?)",
                                  (source_id, target_id, row.relationship_type))
        # 2. Add Document/Chunk data (this would typically happen during
        # the CSV read if the data included document bodies)
        add_document_and_chunks(conn, 'Paper A',
                                "The first half of the document discusses the structure of Acme Corp. "
                                "The second half details the corporate culture and work environment.")
        conn.commit()
        print("Graph populated with entities, relationships, documents, and chunks.")
    except Exception as e:
        print(f"An error occurred during population: {e}")
        conn.rollback()
    finally:
        conn.close()
populate_graph_from_csv()
4. Multi-Hop Query Support (Cypher-like Traversal)
SQLite doesn't natively support recursive path queries like Cypher, but we can simulate it using Recursive Common Table Expressions (RCTE). This allows us to perform multi-hop traversals like MATCH (a:Person)-[r*1..3]->(b) (find paths up to 3 hops from 'a').
Example: Find entities up to 3 hops away from 'Alice'
This SQL query simulates a Cypher-like query:
MATCH (p:Person {name: 'Alice'})-[r*1..3]-(n) RETURN n.name
def execute_multi_hop_query(start_entity_name, max_hops=3):
    """
    Executes a multi-hop traversal query using a Recursive CTE.

    Finds all entities reachable within max_hops of the starting
    entity, following relationships in either direction (undirected
    traversal, like the Cypher pattern -[r*1..max_hops]-), prints
    them as a table, and returns the rows.

    Args:
        start_entity_name: Entity.name value of the traversal root.
        max_hops: maximum path length in hops (the *1..n upper bound).

    Returns:
        List of (name, label, depth, path) tuples; empty if no paths.
    """
    conn = sqlite3.connect(DB_NAME)
    # Both the start name and the hop bound are bound as SQL parameters.
    # The original interpolated max_hops with an f-string, which is an
    # SQL-injection hazard the moment the bound comes from user input.
    rcte_query = """
    WITH RECURSIVE
    -- 1. Anchor: start the traversal at the initial entity (depth 0)
    paths (entity_id, name, label, depth, path) AS (
        SELECT entity_id, name, label, 0, name
        FROM Entity
        WHERE name = ?
        UNION ALL
        -- 2. Recursive part: extend the path by one hop, in either direction
        SELECT
            CASE WHEN r.source_id = p.entity_id THEN r.target_id ELSE r.source_id END,
            e.name,
            e.label,
            p.depth + 1,
            p.path || ' -> ' || e.name
        FROM paths p
        JOIN Relationship r
            ON r.source_id = p.entity_id OR r.target_id = p.entity_id
        JOIN Entity e
            ON e.entity_id = (CASE WHEN r.source_id = p.entity_id
                                   THEN r.target_id ELSE r.source_id END)
        WHERE p.depth < ?
    )
    -- 3. Final selection: unique results, excluding the starting node
    SELECT DISTINCT name, label, depth, path
    FROM paths
    WHERE depth > 0
    ORDER BY depth, name;
    """
    try:
        print(f"\n--- Multi-Hop Query: Find all entities {max_hops} hops from '{start_entity_name}' ---")
        results = conn.execute(rcte_query, (start_entity_name, max_hops)).fetchall()
        if results:
            print(f"| {'Entity Name':<15} | {'Label':<10} | {'Hops':<5} | {'Path':<40} |")
            print("|" + "-"*17 + "|" + "-"*12 + "|" + "-"*7 + "|" + "-"*42 + "|")
            for name, label, depth, path in results:
                print(f"| {name:<15} | {label:<10} | {depth:<5} | {path:<40} |")
        else:
            print("No paths found.")
        return results
    finally:
        # Release the connection even if the query raises (the original
        # leaked it on any exception).
        conn.close()
# --- Execution of the Multi-Hop Query ---
execute_multi_hop_query('Alice', max_hops=3)
Form <https://gemini.google.com/app/> Google Gemini 2.5 Pro
Comments
Post a Comment