#3563 Google Gemini 2.5 Pro: PDF-to-KG+Embedding

#3563 Google Gemini 2.5 Pro: PDF-to-KG+Embedding

#3563 Provide a Python application to read a PDF, embed it, identify entities and relationships, and create Cypher queries to generate a knowledge graph

Hello! I've put together a Python application that will help you build a knowledge graph from a PDF document. This script reads a PDF, uses natural language processing to identify key entities and their relationships, and then generates the corresponding Cypher queries to build a knowledge graph in a Neo4j database.

Here is the complete application, including the Python code and a list of required libraries.

import fitz  # PyMuPDF

import spacy

from spacy.matcher import Matcher

from spacy.tokens import Span

import networkx as nx

import matplotlib.pyplot as plt


def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text of every page, concatenated in document order.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file.
    """
    import os

    # Recent PyMuPDF releases report a missing file with their own
    # RuntimeError-based exception instead of the builtin one, so check up
    # front and raise the standard FileNotFoundError that callers catch.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-copying the accumulated text once per page.
        return "".join(page.get_text() for page in doc)


def extract_entities_and_relationships(text):
    """
    Finds named entities and subject-verb-object relationships with spaCy.

    The "en_core_web_sm" model is loaded on every call. To download the
    model, run: python -m spacy download en_core_web_sm

    Args:
        text (str): The text to process.

    Returns:
        tuple: (entities, relationships) where entities is a list of
            (entity text, entity label) tuples and relationships is a list
            of (subject text, root token text, object text) tuples.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)

    # One (text, label) pair per recognised entity span, in document order.
    entities = [(span.text, span.label_) for span in parsed.ents]

    # Simple rule: a sentence's ROOT token links its subject child to its
    # direct-object child. More sophisticated rules can be added here.
    relationships = []
    for sentence in parsed.sents:
        for head in sentence:
            if head.dep_ != "ROOT":
                continue

            subject = None
            obj = None
            for dependent in head.children:
                role = dependent.dep_
                if role == "nsubj" or role == "nsubjpass":
                    subject = dependent
                elif role == "dobj":
                    obj = dependent

            # Only record the triple when both ends were found.
            if subject and obj:
                relationships.append((subject.text, head.text, obj.text))

    return entities, relationships


def _escape_cypher_string(value):
    """Escapes value for use inside a single-quoted Cypher string literal."""
    # Backslashes go first; otherwise the backslash added for each quote
    # would itself be doubled.
    return value.replace("\\", "\\\\").replace("'", "\\'")


def _to_cypher_identifier(name):
    """Converts name into a token usable as an unquoted label or relationship type."""
    cleaned = "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in name)
    # An unquoted identifier may not be empty or begin with a digit.
    if not cleaned or cleaned[0].isdigit():
        cleaned = f"_{cleaned}"
    return cleaned


def generate_cypher_queries(entities, relationships):
    """
    Generates Cypher queries to create a knowledge graph.

    Each entity becomes a MERGE of a node labelled with the entity label and
    carrying the entity text as its "name" property. Each relationship
    becomes a MATCH of the two nodes by name followed by a MERGE of an
    upper-cased relationship between them; the MATCH only finds nodes that
    already exist under those names.

    Args:
        entities (list): A list of entities (text, label).
        relationships (list): A list of relationships (subject, relation, object).

    Returns:
        list: A list of Cypher queries, node queries first.
    """
    queries = []

    # Create nodes
    for entity, label in entities:
        node_label = _to_cypher_identifier(label)
        node_name = _escape_cypher_string(entity)
        queries.append(f"MERGE (n:{node_label} {{name: '{node_name}'}})")

    # Create relationships
    for sub, rel, obj in relationships:
        source_name = _escape_cypher_string(sub)
        target_name = _escape_cypher_string(obj)
        # Verb text can contain characters (hyphens, apostrophes) that are
        # not valid in an unquoted relationship type.
        rel_type = _to_cypher_identifier(rel.upper())
        queries.append(
            f"MATCH (a {{name: '{source_name}'}}), (b {{name: '{target_name}'}}) "
            f"MERGE (a)-[:{rel_type}]->(b)"
        )

    return queries


def visualize_graph(entities, relationships):
    """
    Draws the knowledge graph with networkx and matplotlib.

    Entities become nodes (their label stored as the "label" attribute) and
    relationships become directed, labelled edges. The figure is displayed
    with plt.show().

    Args:
        entities (list): A list of (text, label) entities.
        relationships (list): A list of (subject, relation, object) tuples.
    """
    graph = nx.DiGraph()

    for name, kind in entities:
        graph.add_node(name, label=kind)

    for source, relation, target in relationships:
        graph.add_edge(source, target, label=relation)

    layout = nx.spring_layout(graph, k=0.5, iterations=50)
    plt.figure(figsize=(12, 12))

    node_style = {
        "with_labels": True,
        "node_size": 2500,
        "node_color": 'skyblue',
        "font_size": 10,
        "font_weight": 'bold',
    }
    nx.draw(graph, layout, **node_style)

    # Edge captions come from the "label" attribute set on each edge above.
    nx.draw_networkx_edge_labels(
        graph,
        layout,
        edge_labels=nx.get_edge_attributes(graph, 'label'),
    )

    plt.title("Knowledge Graph Visualization")
    plt.show()



if __name__ == "__main__":
    # --- Configuration ---
    # 1. Make sure you have a PDF file you want to analyze.
    #    Replace 'sample.pdf' with the path to your PDF.
    pdf_file_path = 'sample.pdf'

    # 2. Ensure you have a running Neo4j instance if you want to import the data.
    #    The script will generate the Cypher queries, which you can then run
    #    in the Neo4j Browser or using a Neo4j driver.

    # Recent PyMuPDF releases raise their own FileNotFoundError (not the
    # builtin one) for a missing file; older releases have no such attribute,
    # hence the getattr fallback.
    missing_file_errors = (
        FileNotFoundError,
        getattr(fitz, "FileNotFoundError", FileNotFoundError),
    )

    # --- Execution ---
    print("1. Extracting text from PDF...")
    try:
        pdf_text = extract_text_from_pdf(pdf_file_path)
    except missing_file_errors:
        print(f"Error: The file '{pdf_file_path}' was not found.")
        print("Please create a 'sample.pdf' file or update the 'pdf_file_path' variable.")
        # exit() is a convenience injected by the site module and reports
        # success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1)
    print("   ...Text extraction complete.")

    print("\n2. Extracting entities and relationships...")
    extracted_entities, extracted_relationships = extract_entities_and_relationships(pdf_text)
    print("   ...Entity and relationship extraction complete.")

    print("\n--- Extracted Entities ---")
    for ent, label in extracted_entities:
        print(f"  - {ent} ({label})")

    print("\n--- Extracted Relationships ---")
    for sub, rel, obj in extracted_relationships:
        print(f"  - {sub} --[{rel}]--> {obj}")

    print("\n3. Generating Cypher queries...")
    cypher_queries = generate_cypher_queries(extracted_entities, extracted_relationships)
    print("   ...Cypher query generation complete.")

    print("\n--- Generated Cypher Queries ---")
    for query in cypher_queries:
        print(query)

    print("\n4. Visualizing the knowledge graph...")
    visualize_graph(extracted_entities, extracted_relationships)
    print("   ...Visualization complete.")

requirements.txt

PyMuPDF

spacy

networkx

matplotlib

Setup and Execution Guide

  1. Install Dependencies: Make sure you have Python installed. Then, install the required libraries using pip:

    pip install -r requirements.txt
    
  2. Download spaCy Model: The application uses a pre-trained spaCy model for Natural Language Processing. Download it with the following command:

    python -m spacy download en_core_web_sm
    
  3. Provide a PDF: Place a PDF file named sample.pdf in the same directory as the Python script, or update the pdf_file_path variable in the script to point to your PDF file.

  4. Run the Script: Execute the Python script from your terminal:

    python your_script_name.py
    
  5. View the Output: The script will print the following to your console:

    • A list of extracted entities and their labels (e.g., PERSON, ORG, GPE).

    • A list of extracted relationships in a (Subject, Verb, Object) format.

    • A series of Cypher MERGE queries.

    • A pop-up window displaying a visualization of the knowledge graph.

  6. Import into Neo4j: Copy the generated Cypher queries and paste them into the Neo4j Browser to create your knowledge graph. The MERGE command ensures that you won't create duplicate nodes or relationships if you run the import process multiple times.

This application provides a solid foundation for your project. The relationship extraction is currently based on a simple rule and can be expanded for greater accuracy and complexity.

Let me know if you have any questions or would like to explore enhancements to the entity and relationship extraction logic!


How about providing embedding for the PDF?

Of course. I can definitely modify the application to include PDF embedding.

I will update the main Python script to generate an embedding for the PDF text, add the necessary library to the requirements.txt file, and update the "Setup and Execution Guide" to reflect this new capability. Here are the updated artifacts.

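I've updated the code and instructions. The application now generates a vector embedding from the PDF's text content, which you could use for tasks like semantic search or document similarity. Keep in mind that the all-MiniLM-L6-v2 model only reads roughly the first 256 word pieces of each input, so a long document has to be split into chunks for the embedding to reflect all of its text. Let me know if you have any other questions!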

import fitz  # PyMuPDF

import spacy

from spacy.matcher import Matcher

from spacy.tokens import Span

import networkx as nx

import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

import numpy as np


def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text of every page, concatenated in document order.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file.
    """
    import os

    # Recent PyMuPDF releases report a missing file with their own
    # RuntimeError-based exception instead of the builtin one, so check up
    # front and raise the standard FileNotFoundError that callers catch.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-copying the accumulated text once per page.
        return "".join(page.get_text() for page in doc)


def get_text_embeddings(text, max_words_per_chunk=180):
    """
    Generates a single embedding vector for the given text.

    The 'all-MiniLM-L6-v2' model truncates each input at roughly 256 word
    pieces, so encoding a whole document in one call would silently ignore
    everything after the first few hundred words. Text longer than
    max_words_per_chunk words is therefore split into consecutive word
    chunks, each chunk is embedded, and the chunk vectors are averaged.
    Shorter text is encoded directly in a single call.

    Args:
        text (str): The text to embed.
        max_words_per_chunk (int): Largest number of whitespace-separated
            words per chunk. The default is a heuristic chosen to stay below
            the model's input limit.

    Returns:
        numpy.ndarray: The embedding vector for the text.

    Raises:
        ValueError: If max_words_per_chunk is smaller than 1.
    """
    if max_words_per_chunk < 1:
        raise ValueError("max_words_per_chunk must be a positive integer")

    model = SentenceTransformer('all-MiniLM-L6-v2')

    words = text.split()
    if len(words) <= max_words_per_chunk:
        # Short (or empty) input fits in one pass; encode it untouched.
        return model.encode(text)

    chunks = [
        " ".join(words[start:start + max_words_per_chunk])
        for start in range(0, len(words), max_words_per_chunk)
    ]
    # Encoding a list yields one row per chunk; the mean over rows gives a
    # document-level vector with the same dimensionality as a single row.
    chunk_embeddings = np.asarray(model.encode(chunks))
    return chunk_embeddings.mean(axis=0)


def extract_entities_and_relationships(text):
    """
    Finds named entities and subject-verb-object relationships with spaCy.

    The "en_core_web_sm" model is loaded on every call. To download the
    model, run: python -m spacy download en_core_web_sm

    Args:
        text (str): The text to process.

    Returns:
        tuple: (entities, relationships) where entities is a list of
            (entity text, entity label) tuples and relationships is a list
            of (subject text, root token text, object text) tuples.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)

    # One (text, label) pair per recognised entity span, in document order.
    entities = [(span.text, span.label_) for span in parsed.ents]

    # Simple rule: a sentence's ROOT token links its subject child to its
    # direct-object child. More sophisticated rules can be added here.
    relationships = []
    for sentence in parsed.sents:
        for head in sentence:
            if head.dep_ != "ROOT":
                continue

            subject = None
            obj = None
            for dependent in head.children:
                role = dependent.dep_
                if role == "nsubj" or role == "nsubjpass":
                    subject = dependent
                elif role == "dobj":
                    obj = dependent

            # Only record the triple when both ends were found.
            if subject and obj:
                relationships.append((subject.text, head.text, obj.text))

    return entities, relationships


def _escape_cypher_string(value):
    """Escapes value for use inside a single-quoted Cypher string literal."""
    # Backslashes go first; otherwise the backslash added for each quote
    # would itself be doubled.
    return value.replace("\\", "\\\\").replace("'", "\\'")


def _to_cypher_identifier(name):
    """Converts name into a token usable as an unquoted label or relationship type."""
    cleaned = "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in name)
    # An unquoted identifier may not be empty or begin with a digit.
    if not cleaned or cleaned[0].isdigit():
        cleaned = f"_{cleaned}"
    return cleaned


def generate_cypher_queries(entities, relationships):
    """
    Generates Cypher queries to create a knowledge graph.

    Each entity becomes a MERGE of a node labelled with the entity label and
    carrying the entity text as its "name" property. Each relationship
    becomes a MATCH of the two nodes by name followed by a MERGE of an
    upper-cased relationship between them; the MATCH only finds nodes that
    already exist under those names.

    Args:
        entities (list): A list of entities (text, label).
        relationships (list): A list of relationships (subject, relation, object).

    Returns:
        list: A list of Cypher queries, node queries first.
    """
    queries = []

    # Create nodes
    for entity, label in entities:
        node_label = _to_cypher_identifier(label)
        node_name = _escape_cypher_string(entity)
        queries.append(f"MERGE (n:{node_label} {{name: '{node_name}'}})")

    # Create relationships
    for sub, rel, obj in relationships:
        source_name = _escape_cypher_string(sub)
        target_name = _escape_cypher_string(obj)
        # Verb text can contain characters (hyphens, apostrophes) that are
        # not valid in an unquoted relationship type.
        rel_type = _to_cypher_identifier(rel.upper())
        queries.append(
            f"MATCH (a {{name: '{source_name}'}}), (b {{name: '{target_name}'}}) "
            f"MERGE (a)-[:{rel_type}]->(b)"
        )

    return queries


def visualize_graph(entities, relationships):
    """
    Draws the knowledge graph with networkx and matplotlib.

    Entities become nodes (their label stored as the "label" attribute) and
    relationships become directed, labelled edges. The figure is displayed
    with plt.show().

    Args:
        entities (list): A list of (text, label) entities.
        relationships (list): A list of (subject, relation, object) tuples.
    """
    graph = nx.DiGraph()

    for name, kind in entities:
        graph.add_node(name, label=kind)

    for source, relation, target in relationships:
        graph.add_edge(source, target, label=relation)

    layout = nx.spring_layout(graph, k=0.5, iterations=50)
    plt.figure(figsize=(12, 12))

    node_style = {
        "with_labels": True,
        "node_size": 2500,
        "node_color": 'skyblue',
        "font_size": 10,
        "font_weight": 'bold',
    }
    nx.draw(graph, layout, **node_style)

    # Edge captions come from the "label" attribute set on each edge above.
    nx.draw_networkx_edge_labels(
        graph,
        layout,
        edge_labels=nx.get_edge_attributes(graph, 'label'),
    )

    plt.title("Knowledge Graph Visualization")
    plt.show()



if __name__ == "__main__":
    # --- Configuration ---
    # 1. Make sure you have a PDF file you want to analyze.
    #    Replace 'sample.pdf' with the path to your PDF.
    pdf_file_path = 'sample.pdf'

    # 2. Ensure you have a running Neo4j instance if you want to import the data.
    #    The script will generate the Cypher queries, which you can then run
    #    in the Neo4j Browser or using a Neo4j driver.

    # Recent PyMuPDF releases raise their own FileNotFoundError (not the
    # builtin one) for a missing file; older releases have no such attribute,
    # hence the getattr fallback.
    missing_file_errors = (
        FileNotFoundError,
        getattr(fitz, "FileNotFoundError", FileNotFoundError),
    )

    # --- Execution ---
    print("1. Extracting text from PDF...")
    try:
        pdf_text = extract_text_from_pdf(pdf_file_path)
    except missing_file_errors:
        print(f"Error: The file '{pdf_file_path}' was not found.")
        print("Please create a 'sample.pdf' file or update the 'pdf_file_path' variable.")
        # exit() is a convenience injected by the site module and reports
        # success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1)
    print("   ...Text extraction complete.")

    print("\n2. Generating PDF text embedding...")
    pdf_embedding = get_text_embeddings(pdf_text)
    print("   ...Embedding generation complete.")
    print(f"   Embedding vector shape: {pdf_embedding.shape}")
    # You can now use this 'pdf_embedding' vector for similarity searches or other ML tasks.
    # For demonstration, we'll just show a snippet.
    print(f"   Embedding snippet: {pdf_embedding[:5]}...")

    print("\n3. Extracting entities and relationships...")
    extracted_entities, extracted_relationships = extract_entities_and_relationships(pdf_text)
    print("   ...Entity and relationship extraction complete.")

    print("\n--- Extracted Entities ---")
    for ent, label in extracted_entities:
        print(f"  - {ent} ({label})")

    print("\n--- Extracted Relationships ---")
    for sub, rel, obj in extracted_relationships:
        print(f"  - {sub} --[{rel}]--> {obj}")

    print("\n4. Generating Cypher queries...")
    cypher_queries = generate_cypher_queries(extracted_entities, extracted_relationships)
    print("   ...Cypher query generation complete.")

    print("\n--- Generated Cypher Queries ---")
    for query in cypher_queries:
        print(query)

    print("\n5. Visualizing the knowledge graph...")
    visualize_graph(extracted_entities, extracted_relationships)
    print("   ...Visualization complete.")

requirements.txt

PyMuPDF
spacy
networkx
matplotlib
sentence-transformers

Setup and Execution Guide

  1. Install Dependencies: Make sure you have Python installed. Then, install the required libraries using pip:

    pip install -r requirements.txt
    
  2. Download spaCy Model: The application uses a pre-trained spaCy model for Natural Language Processing. Download it with the following command:

    python -m spacy download en_core_web_sm
    
  3. Provide a PDF: Place a PDF file named sample.pdf in the same directory as the Python script, or update the pdf_file_path variable in the script to point to your PDF file.

  4. Run the Script: Execute the Python script from your terminal:

    python your_script_name.py
    
  5. View the Output: The script will print the following to your console:

    • A confirmation that the PDF text embedding has been generated, along with the shape of the vector and a small snippet.

    • A list of extracted entities and their labels (e.g., PERSON, ORG, GPE).

    • A list of extracted relationships in a (Subject, Verb, Object) format.

    • A series of Cypher MERGE queries.

    • A pop-up window displaying a visualization of the knowledge graph.

  6. Import into Neo4j: Copy the generated Cypher queries and paste them into the Neo4j Browser to create your knowledge graph. The MERGE command ensures that you won't create duplicate nodes or relationships if you run the import process multiple times.


From <https://gemini.google.com/app/>  Google Gemini 2.5 Pro

Comments

Popular posts from this blog

#2892 Example of ChatGPT o1 Prompt

#3105 Federal and State Spending Idiocy

#3106 Google Gemini Advanced 2.0 Flash: NASA Software Process Requirements Checklist Template