#3563 Google Gemini 2.5 Pro: PDF-to-KG+Embedding
#3563 Google Gemini 2.5 Pro: PDF-to-KG+Embedding
#3563 Provide python application to read a PDF, embed it, identify entities and relationships and create Cypher queries to generate a knowledge graph
Hello! I've put together a Python application that will help you build a knowledge graph from a PDF document. This script reads a PDF, uses natural language processing to identify key entities and their relationships, and then generates the corresponding Cypher queries to build a knowledge graph in a Neo4j database.
Here is the complete application, including the Python code and a list of required libraries.
import fitz # PyMuPDF
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
import networkx as nx
import matplotlib.pyplot as plt
def extract_text_from_pdf(pdf_path):
    """
    Extracts the plain text of every page of a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text of all pages, concatenated in page order.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # PyMuPDF signals a missing file with a RuntimeError-based exception of
    # its own, which the `except FileNotFoundError` in the script's entry
    # point would not catch. Check up front and raise the builtin instead.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-allocating the growing string on every page.
        return "".join(page.get_text() for page in doc)
def extract_entities_and_relationships(text):
    """
    Runs spaCy over the text and collects named entities plus simple
    subject-verb-object relationships.

    Args:
        text (str): The text to process.

    Returns:
        tuple: (entities, relationships), where entities is a list of
            (entity text, entity label) pairs and relationships is a list
            of (subject text, root token text, object text) triples.
    """
    # The model has to be installed beforehand:
    #   python -m spacy download en_core_web_sm
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)

    # Every named entity in document order, duplicates included.
    entities = [(span.text, span.label_) for span in parsed.ents]

    # Rule-based extraction: for each ROOT token, pair its subject child with
    # its direct-object child. Further rules can be added to this loop.
    relationships = []
    for sentence in parsed.sents:
        for head in sentence:
            if head.dep_ != "ROOT":
                continue
            subject = None
            obj = None
            for dependent in head.children:
                role = dependent.dep_
                if role in ("nsubj", "nsubjpass"):
                    subject = dependent
                elif role == "dobj":
                    obj = dependent
            # A triple is recorded only when both sides were found.
            if subject and obj:
                relationships.append((subject.text, head.text, obj.text))

    return entities, relationships
def _cypher_string(value):
    """Returns value rendered as a single-quoted Cypher string literal."""
    # Backslashes are escaped first so the escapes added afterwards survive.
    escaped = (
        str(value)
        .replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f"'{escaped}'"


def _cypher_name(value, fallback):
    """Returns value as a label / relationship-type name that is safe to embed."""
    name = str(value).strip() or fallback
    if name.isidentifier():
        return name
    # Names containing spaces, hyphens, apostrophes, etc. must be wrapped in
    # backticks; a backtick inside the name is escaped by doubling it.
    return "`" + name.replace("`", "``") + "`"


def generate_cypher_queries(entities, relationships):
    """
    Generates Cypher queries to create a knowledge graph.

    Each entity becomes a `MERGE` of a node carrying the entity label and a
    `name` property. Each relationship becomes a `MATCH` of the two endpoint
    nodes by `name` followed by a `MERGE` of the upper-cased relation between
    them. Because the endpoints are matched rather than merged, a
    relationship whose subject or object has no node with that name creates
    nothing when the query runs.

    Args:
        entities (list): A list of entities (text, label).
        relationships (list): A list of relationships (subject, relation, object).

    Returns:
        list: A list of Cypher queries, in first-seen order with exact
            duplicates removed.
    """
    queries = []

    # Create nodes.
    for entity, label in entities:
        queries.append(
            f"MERGE (n:{_cypher_name(label, 'Entity')} "
            f"{{name: {_cypher_string(entity)}}})"
        )

    # Create relationships.
    for sub, rel, obj in relationships:
        queries.append(
            f"MATCH (a {{name: {_cypher_string(sub)}}}), "
            f"(b {{name: {_cypher_string(obj)}}}) "
            f"MERGE (a)-[:{_cypher_name(rel.upper(), 'RELATED_TO')}]->(b)"
        )

    # The same entity is usually mentioned many times in a document; dict
    # keys keep insertion order, so this drops repeats without reordering.
    return list(dict.fromkeys(queries))
def visualize_graph(entities, relationships):
    """
    Draws the entities and relationships as a directed graph using networkx
    and matplotlib.

    Args:
        entities (list): (text, label) pairs; each text becomes a node.
        relationships (list): (subject, relation, object) triples; each
            becomes an edge from subject to object labelled with the relation.
    """
    graph = nx.DiGraph()

    for name, kind in entities:
        graph.add_node(name, label=kind)
    for source, relation, target in relationships:
        graph.add_edge(source, target, label=relation)

    layout = nx.spring_layout(graph, k=0.5, iterations=50)

    plt.figure(figsize=(12, 12))
    node_style = {
        "with_labels": True,
        "node_size": 2500,
        "node_color": 'skyblue',
        "font_size": 10,
        "font_weight": 'bold',
    }
    nx.draw(graph, layout, **node_style)
    # Edge captions come from the 'label' attribute stored on each edge.
    nx.draw_networkx_edge_labels(
        graph,
        layout,
        edge_labels=nx.get_edge_attributes(graph, 'label'),
    )
    plt.title("Knowledge Graph Visualization")
    plt.show()
if __name__ == "__main__":
    # Script entry point: PDF -> text -> entities/relationships -> Cypher
    # queries -> on-screen graph.

    # --- Configuration ---
    # 1. Make sure you have a PDF file you want to analyze.
    # Replace 'sample.pdf' with the path to your PDF.
    pdf_file_path = 'sample.pdf'
    # 2. Ensure you have a running Neo4j instance if you want to import the data.
    # The script will generate the Cypher queries, which you can then run
    # in the Neo4j Browser or using a Neo4j driver.

    # --- Execution ---
    print("1. Extracting text from PDF...")
    try:
        pdf_text = extract_text_from_pdf(pdf_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{pdf_file_path}' was not found.")
        print("Please create a 'sample.pdf' file or update the 'pdf_file_path' variable.")
        # SystemExit with a non-zero status instead of the interactive
        # `exit()` helper, so callers and shells can detect the failure.
        raise SystemExit(1)
    except RuntimeError as err:
        # PyMuPDF reports missing or unreadable documents with
        # RuntimeError-based exceptions rather than the builtin above.
        print(f"Error: Could not open '{pdf_file_path}': {err}")
        raise SystemExit(1)
    else:
        print(" ...Text extraction complete.")

    print("\n2. Extracting entities and relationships...")
    extracted_entities, extracted_relationships = extract_entities_and_relationships(pdf_text)
    print(" ...Entity and relationship extraction complete.")

    print("\n--- Extracted Entities ---")
    for ent, label in extracted_entities:
        print(f" - {ent} ({label})")

    print("\n--- Extracted Relationships ---")
    for sub, rel, obj in extracted_relationships:
        print(f" - {sub} --[{rel}]--> {obj}")

    print("\n3. Generating Cypher queries...")
    cypher_queries = generate_cypher_queries(extracted_entities, extracted_relationships)
    print(" ...Cypher query generation complete.")

    print("\n--- Generated Cypher Queries ---")
    for query in cypher_queries:
        print(query)

    print("\n4. Visualizing the knowledge graph...")
    visualize_graph(extracted_entities, extracted_relationships)
    print(" ...Visualization complete.")
requirements.txt
PyMuPDF
spacy
networkx
matplotlib
Setup and Execution Guide
Install Dependencies: Make sure you have Python installed. Then, install the required libraries using pip:
pip install -r requirements.txt
Download spaCy Model: The application uses a pre-trained spaCy model for Natural Language Processing. Download it with the following command:
python -m spacy download en_core_web_sm
Provide a PDF: Place a PDF file named
sample.pdf
in the same directory as the Python script, or update the pdf_file_path
variable in the script to point to your PDF file.
Run the Script: Execute the Python script from your terminal:
python your_script_name.py
View the Output: The script will print the following to your console:
A list of extracted entities and their labels (e.g., PERSON, ORG, GPE).
A list of extracted relationships in a (Subject, Verb, Object) format.
A series of Cypher MERGE queries.
A pop-up window displaying a visualization of the knowledge graph.
Import into Neo4j: Copy the generated Cypher queries and paste them into the Neo4j Browser to create your knowledge graph. The
MERGE
command ensures that you won't create duplicate nodes or relationships if you run the import process multiple times.
This application provides a solid foundation for your project. The relationship extraction is currently based on a simple rule and can be expanded for greater accuracy and complexity.
Let me know if you have any questions or would like to explore enhancements to the entity and relationship extraction logic!
How about providing embedding for the PDF?
Of course. I can definitely modify the application to include PDF embedding.
I will update the main Python script to generate embeddings for the text chunks, add the necessary library to the requirements.txt
file, and update the "How to Run the Application" guide to reflect this new capability. Here are the updated artifacts.
I've updated the code and instructions. The application now generates a vector embedding for the entire text content of the PDF, which you could use for tasks like semantic search or document similarity. Let me know if you have any other questions!
import fitz # PyMuPDF
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
import networkx as nx
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import numpy as np
def extract_text_from_pdf(pdf_path):
    """
    Extracts the plain text of every page of a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text of all pages, concatenated in page order.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # PyMuPDF signals a missing file with a RuntimeError-based exception of
    # its own, which the `except FileNotFoundError` in the script's entry
    # point would not catch. Check up front and raise the builtin instead.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-allocating the growing string on every page.
        return "".join(page.get_text() for page in doc)
def get_text_embeddings(text, max_words_per_chunk=150):
    """
    Generates a single embedding vector for the given text.

    Sentence-transformers models truncate any input longer than their
    maximum sequence length, so encoding a whole PDF in one call would only
    represent the beginning of the document. Texts longer than
    max_words_per_chunk words are therefore split into consecutive word
    chunks, every chunk is embedded, and the chunk vectors are averaged
    (unweighted, and not re-normalised afterwards).

    Args:
        text (str): The text to embed.
        max_words_per_chunk (int): Maximum number of whitespace-separated
            words per chunk. The default is meant to stay under the
            256 word-piece limit documented for 'all-MiniLM-L6-v2' on
            ordinary English prose; lower it for token-dense text.

    Returns:
        numpy.ndarray: The embedding vector for the text.

    Raises:
        ValueError: If max_words_per_chunk is smaller than 1.
    """
    if max_words_per_chunk < 1:
        raise ValueError("max_words_per_chunk must be at least 1")

    model = SentenceTransformer('all-MiniLM-L6-v2')

    words = text.split()
    if len(words) <= max_words_per_chunk:
        # Short (or empty) input fits in one pass; encode it untouched so the
        # result is identical to a plain model.encode(text) call.
        return model.encode(text)

    chunks = [
        " ".join(words[start:start + max_words_per_chunk])
        for start in range(0, len(words), max_words_per_chunk)
    ]
    # Encoding a list yields one row per chunk; averaging over axis 0
    # collapses them back into a single document-level vector.
    chunk_embeddings = model.encode(chunks)
    return np.mean(chunk_embeddings, axis=0)
def extract_entities_and_relationships(text):
    """
    Runs spaCy over the text and collects named entities plus simple
    subject-verb-object relationships.

    Args:
        text (str): The text to process.

    Returns:
        tuple: (entities, relationships), where entities is a list of
            (entity text, entity label) pairs and relationships is a list
            of (subject text, root token text, object text) triples.
    """
    # The model has to be installed beforehand:
    #   python -m spacy download en_core_web_sm
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)

    # Every named entity in document order, duplicates included.
    entities = [(span.text, span.label_) for span in parsed.ents]

    # Rule-based extraction: for each ROOT token, pair its subject child with
    # its direct-object child. Further rules can be added to this loop.
    relationships = []
    for sentence in parsed.sents:
        for head in sentence:
            if head.dep_ != "ROOT":
                continue
            subject = None
            obj = None
            for dependent in head.children:
                role = dependent.dep_
                if role in ("nsubj", "nsubjpass"):
                    subject = dependent
                elif role == "dobj":
                    obj = dependent
            # A triple is recorded only when both sides were found.
            if subject and obj:
                relationships.append((subject.text, head.text, obj.text))

    return entities, relationships
def _cypher_string(value):
    """Returns value rendered as a single-quoted Cypher string literal."""
    # Backslashes are escaped first so the escapes added afterwards survive.
    escaped = (
        str(value)
        .replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f"'{escaped}'"


def _cypher_name(value, fallback):
    """Returns value as a label / relationship-type name that is safe to embed."""
    name = str(value).strip() or fallback
    if name.isidentifier():
        return name
    # Names containing spaces, hyphens, apostrophes, etc. must be wrapped in
    # backticks; a backtick inside the name is escaped by doubling it.
    return "`" + name.replace("`", "``") + "`"


def generate_cypher_queries(entities, relationships):
    """
    Generates Cypher queries to create a knowledge graph.

    Each entity becomes a `MERGE` of a node carrying the entity label and a
    `name` property. Each relationship becomes a `MATCH` of the two endpoint
    nodes by `name` followed by a `MERGE` of the upper-cased relation between
    them. Because the endpoints are matched rather than merged, a
    relationship whose subject or object has no node with that name creates
    nothing when the query runs.

    Args:
        entities (list): A list of entities (text, label).
        relationships (list): A list of relationships (subject, relation, object).

    Returns:
        list: A list of Cypher queries, in first-seen order with exact
            duplicates removed.
    """
    queries = []

    # Create nodes.
    for entity, label in entities:
        queries.append(
            f"MERGE (n:{_cypher_name(label, 'Entity')} "
            f"{{name: {_cypher_string(entity)}}})"
        )

    # Create relationships.
    for sub, rel, obj in relationships:
        queries.append(
            f"MATCH (a {{name: {_cypher_string(sub)}}}), "
            f"(b {{name: {_cypher_string(obj)}}}) "
            f"MERGE (a)-[:{_cypher_name(rel.upper(), 'RELATED_TO')}]->(b)"
        )

    # The same entity is usually mentioned many times in a document; dict
    # keys keep insertion order, so this drops repeats without reordering.
    return list(dict.fromkeys(queries))
def visualize_graph(entities, relationships):
    """
    Draws the entities and relationships as a directed graph using networkx
    and matplotlib.

    Args:
        entities (list): (text, label) pairs; each text becomes a node.
        relationships (list): (subject, relation, object) triples; each
            becomes an edge from subject to object labelled with the relation.
    """
    graph = nx.DiGraph()

    for name, kind in entities:
        graph.add_node(name, label=kind)
    for source, relation, target in relationships:
        graph.add_edge(source, target, label=relation)

    layout = nx.spring_layout(graph, k=0.5, iterations=50)

    plt.figure(figsize=(12, 12))
    node_style = {
        "with_labels": True,
        "node_size": 2500,
        "node_color": 'skyblue',
        "font_size": 10,
        "font_weight": 'bold',
    }
    nx.draw(graph, layout, **node_style)
    # Edge captions come from the 'label' attribute stored on each edge.
    nx.draw_networkx_edge_labels(
        graph,
        layout,
        edge_labels=nx.get_edge_attributes(graph, 'label'),
    )
    plt.title("Knowledge Graph Visualization")
    plt.show()
if __name__ == "__main__":
    # Script entry point: PDF -> text -> embedding -> entities/relationships
    # -> Cypher queries -> on-screen graph.

    # --- Configuration ---
    # 1. Make sure you have a PDF file you want to analyze.
    # Replace 'sample.pdf' with the path to your PDF.
    pdf_file_path = 'sample.pdf'
    # 2. Ensure you have a running Neo4j instance if you want to import the data.
    # The script will generate the Cypher queries, which you can then run
    # in the Neo4j Browser or using a Neo4j driver.

    # --- Execution ---
    print("1. Extracting text from PDF...")
    try:
        pdf_text = extract_text_from_pdf(pdf_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{pdf_file_path}' was not found.")
        print("Please create a 'sample.pdf' file or update the 'pdf_file_path' variable.")
        # SystemExit with a non-zero status instead of the interactive
        # `exit()` helper, so callers and shells can detect the failure.
        raise SystemExit(1)
    except RuntimeError as err:
        # PyMuPDF reports missing or unreadable documents with
        # RuntimeError-based exceptions rather than the builtin above.
        print(f"Error: Could not open '{pdf_file_path}': {err}")
        raise SystemExit(1)
    else:
        print(" ...Text extraction complete.")

    print("\n2. Generating PDF text embedding...")
    pdf_embedding = get_text_embeddings(pdf_text)
    print(" ...Embedding generation complete.")
    print(f" Embedding vector shape: {pdf_embedding.shape}")
    # You can now use this 'pdf_embedding' vector for similarity searches or other ML tasks.
    # For demonstration, we'll just show a snippet.
    print(f" Embedding snippet: {pdf_embedding[:5]}...")

    print("\n3. Extracting entities and relationships...")
    extracted_entities, extracted_relationships = extract_entities_and_relationships(pdf_text)
    print(" ...Entity and relationship extraction complete.")

    print("\n--- Extracted Entities ---")
    for ent, label in extracted_entities:
        print(f" - {ent} ({label})")

    print("\n--- Extracted Relationships ---")
    for sub, rel, obj in extracted_relationships:
        print(f" - {sub} --[{rel}]--> {obj}")

    print("\n4. Generating Cypher queries...")
    cypher_queries = generate_cypher_queries(extracted_entities, extracted_relationships)
    print(" ...Cypher query generation complete.")

    print("\n--- Generated Cypher Queries ---")
    for query in cypher_queries:
        print(query)

    print("\n5. Visualizing the knowledge graph...")
    visualize_graph(extracted_entities, extracted_relationships)
    print(" ...Visualization complete.")
requirements.txt
PyMuPDF
spacy
networkx
matplotlib
sentence-transformers
numpy
Setup and Execution Guide
Install Dependencies: Make sure you have Python installed. Then, install the required libraries using pip:
pip install -r requirements.txt
Download spaCy Model: The application uses a pre-trained spaCy model for Natural Language Processing. Download it with the following command:
python -m spacy download en_core_web_sm
Provide a PDF: Place a PDF file named
sample.pdf
in the same directory as the Python script, or update the pdf_file_path
variable in the script to point to your PDF file.
Run the Script: Execute the Python script from your terminal:
python your_script_name.py
View the Output: The script will print the following to your console:
A confirmation that the PDF text embedding has been generated, along with the shape of the vector and a small snippet.
A list of extracted entities and their labels (e.g., PERSON, ORG, GPE).
A list of extracted relationships in a (Subject, Verb, Object) format.
A series of Cypher MERGE queries.
A pop-up window displaying a visualization of the knowledge graph.
Import into Neo4j: Copy the generated Cypher queries and paste them into the Neo4j Browser to create your knowledge graph. The
MERGE
command ensures that you won't create duplicate nodes or relationships if you run the import process multiple times.
From <https://gemini.google.com/app/> Google Gemini 2.5 Pro
Comments
Post a Comment