#3938 Software Hazard Report PDF Parser
#3938 Provide a python application to parse a NASA software hazard report in PDF format, generating a CSV with: hazard report title, cause, control, verification, where each report has several causes, several controls per cause, and one or more verifications per control
This is a classic data extraction problem involving hierarchical data (1 Report --> Many Causes --> Many Controls --> Many Verifications) embedded in a flat PDF format.
Because NASA hazard report formats vary (text-heavy narrative documents vs. tabular forms like NF 1707), the most robust approach is a state-machine parser. The script below reads the PDF line by line, detects keywords (like "Cause:" and "Control:"), and maintains context so it can flatten the hierarchy into a CSV.
Prerequisites
You will need the pdfplumber library, which is superior to PyPDF2 for maintaining text layout and extracting data from forms.
pip install pdfplumber pandas
Python Application
This script includes a HazardParser class. I have designed the regex patterns (self.patterns) to be easily configurable, as the specific headers in your PDF might differ slightly (e.g., "Verification Method" vs. "Verification").
import pdfplumber
import re
import pandas as pd


class HazardReportParser:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.data = []

        # DEFINING PATTERNS
        # Adjust these regex patterns to match the specific headers in your NASA PDF.
        # The keys below map to the hierarchy levels.
        self.patterns = {
            'title': re.compile(r'^(?:Hazard Report Name|Title|Subject):\s*(.*)', re.IGNORECASE),
            'cause': re.compile(r'^(?:Cause|Initiating Mechanism)\s*[\d\.\-]*:\s*(.*)', re.IGNORECASE),
            'control': re.compile(r'^(?:Control|Mitigation)\s*[\d\.\-]*:\s*(.*)', re.IGNORECASE),
            'verification': re.compile(r'^(?:Verification|Method)\s*[\d\.\-]*:\s*(.*)', re.IGNORECASE),
        }
    def parse(self):
        """Main execution method to parse the PDF and return a DataFrame."""
        print(f"Opening {self.pdf_path}...")
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                full_text = ""
                # Extract text from all pages
                for page in pdf.pages:
                    # layout=True attempts to mimic the physical layout, often better for forms
                    page_text = page.extract_text(layout=True)
                    if page_text:
                        full_text += page_text + "\n"
                self._process_text_stream(full_text)
        except FileNotFoundError:
            print("Error: File not found.")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
        return self._generate_dataframe()
    def _process_text_stream(self, text):
        """
        Parses the text stream using a state machine approach.
        It buffers multi-line descriptions until a new keyword is hit.
        """
        lines = text.split('\n')

        # State variables
        current_title = "Unknown Report"
        current_cause = "N/A"
        current_control = "N/A"

        # Buffers to hold multi-line text
        buffer_type = None  # Can be 'title', 'cause', 'control', 'verification'
        text_buffer = []

        def flush_buffer():
            """Updates the current state variables with the buffered text."""
            nonlocal current_title, current_cause, current_control
            joined_text = " ".join(text_buffer).strip()
            if not joined_text:
                return
            if buffer_type == 'title':
                current_title = joined_text
            elif buffer_type == 'cause':
                current_cause = joined_text
                # Reset the lower hierarchy when a new cause starts
                current_control = "N/A"
            elif buffer_type == 'control':
                current_control = joined_text
            elif buffer_type == 'verification':
                # When we finish a verification block, we commit the row
                self.data.append({
                    'Report Title': current_title,
                    'Cause': current_cause,
                    'Control': current_control,
                    'Verification': joined_text
                })
        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check for Title
            match_title = self.patterns['title'].match(line)
            if match_title:
                flush_buffer()
                buffer_type = 'title'
                text_buffer = [match_title.group(1)]
                continue

            # Check for Cause
            match_cause = self.patterns['cause'].match(line)
            if match_cause:
                flush_buffer()
                buffer_type = 'cause'
                text_buffer = [match_cause.group(1)]
                continue

            # Check for Control
            match_control = self.patterns['control'].match(line)
            if match_control:
                flush_buffer()
                buffer_type = 'control'
                text_buffer = [match_control.group(1)]
                continue

            # Check for Verification
            match_ver = self.patterns['verification'].match(line)
            if match_ver:
                flush_buffer()
                buffer_type = 'verification'
                text_buffer = [match_ver.group(1)]
                continue

            # If no keyword matched, it's a continuation of the previous line
            text_buffer.append(line)

        # Flush any remaining data at the end of the file
        flush_buffer()

    def _generate_dataframe(self):
        return pd.DataFrame(self.data)

# --- Usage Example ---
if __name__ == "__main__":
    # 1. Set your PDF path
    input_pdf = "hazard_report.pdf"
    output_csv = "hazard_output.csv"

    # 2. Run the parser
    parser = HazardReportParser(input_pdf)
    df = parser.parse()

    if df is not None and not df.empty:
        # 3. Save to CSV
        df.to_csv(output_csv, index=False)
        print(f"Success! Extracted {len(df)} rows.")
        print(f"Saved to {output_csv}")
        print(df.head())
    else:
        print("No data extracted. Please check the regex patterns against your PDF.")
How to adapt this for your specific PDF
Regex Patterns: Look at the self.patterns dictionary. If your PDF labels "Causes" as "Hazard Causes 1.0", you must update the regex:

'cause': re.compile(r'^Hazard Causes\s*[\d\.]*:\s*(.*)', re.IGNORECASE)

Layout/Table Mode: The script uses extract_text(layout=True). If your hazard report is a strict grid/table (lines drawn around every cell), text extraction can sometimes jumble columns. If this happens, pdfplumber has a specific extract_tables() method that is more accurate for grids, but it requires a different logic loop (iterating over table rows rather than text lines); a sketch follows below.
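As a starting point, here is a minimal sketch of that table-driven variant. It is an assumption-heavy illustration, not a drop-in replacement: it assumes each extracted table row is a two-cell (label, text) pair, and the label spellings, cell indices, and file names are placeholders to adapt to your actual form.

import pdfplumber
import pandas as pd

rows = []
state = {"title": "Unknown Report", "cause": "N/A", "control": "N/A"}

with pdfplumber.open("hazard_report.pdf") as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            for row in table:
                # ASSUMPTION: each row is (field label, field text); adjust
                # the indices if your form places labels elsewhere.
                if len(row) < 2 or not row[0]:
                    continue
                label = row[0].strip().lower().rstrip(":")
                text = (row[1] or "").strip()
                if label in ("title", "hazard report name", "subject"):
                    state["title"] = text
                elif label in ("cause", "initiating mechanism"):
                    state["cause"], state["control"] = text, "N/A"
                elif label in ("control", "mitigation"):
                    state["control"] = text
                elif label in ("verification", "method"):
                    # Commit one denormalized row per verification
                    rows.append({"Report Title": state["title"],
                                 "Cause": state["cause"],
                                 "Control": state["control"],
                                 "Verification": text})

pd.DataFrame(rows).to_csv("hazard_output.csv", index=False)

The state handling mirrors the text-based parser: the same reset-on-new-cause rule applies, only the input unit changes from a text line to a table row.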
Logic Explanation
The script solves the hierarchy problem by denormalizing the data as it parses:
It remembers the "Current Cause" until it hits a new "Cause" line.
It remembers the "Current Control" until it hits a new "Control" line.
It writes a row only when it finishes processing a "Verification" block, attaching the currently active Cause and Control to that Verification.
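To see the denormalization concretely, you can feed a synthetic text stream directly into _process_text_stream, bypassing the PDF step entirely (the constructor never opens the file). The report content below is invented purely for illustration.

sample_text = """Title: Loss of Cabin Pressure
Cause 1: Seal degradation
Control 1.1: Periodic seal inspection
Verification: Inspection records audit
Verification: Leak-rate test
Control 1.2: Redundant seal design
Verification: Design review"""

parser = HazardReportParser("unused.pdf")  # path is never opened in this test
parser._process_text_stream(sample_text)
print(parser._generate_dataframe())
# Expected: 3 rows, all titled "Loss of Cabin Pressure". The first two share
# Cause "Seal degradation" and Control "Periodic seal inspection"; the third
# row carries Control "Redundant seal design".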
Would you like me to adjust this to use extract_tables() instead, if your source documents are strictly formatted grid forms (like standard NASA Forms)?
From <https://gemini.google.com/app/> Google Gemini 3.0 Pro (Thinking)
