Extract structured data from construction PDFs. Convert specifications, BOMs, schedules, and reports from PDF to Excel/CSV/JSON. Use OCR for scanned documents and pdfplumber for native PDFs.
数据来源:ClawHub。 在 ClawSkills 查看
选择你使用的 Agent
方法一:命令行安装(推荐)
推荐(无需提前安装 clawhub)
npx clawhub@latest --dir ~/.claude/skills install pdf-to-structured
或使用 clawhub CLI(需提前安装)
clawhub --dir ~/.claude/skills install pdf-to-structured
⚠️ 需要 Node.js 18+,没有 Node?请使用下方方法二直接下载 ZIP。 安装 Node.js →
方法二:手动下载安装(无需 Node)
下载 ZIP,解压后将文件夹放到以下路径,重启 Agent 即可:
安装路径
~/.claude/skills/pdf-to-structured/
💡 解压后将文件夹放到上方路径,重启 Agent 即可生效
---
name: "pdf-to-structured"
description: "Extract structured data from construction PDFs. Convert specifications, BOMs, schedules, and reports from PDF to Excel/CSV/JSON. Use OCR for scanned documents and pdfplumber for native PDFs."
---
Based on DDC methodology (Chapter 2.4), this skill transforms unstructured PDF documents into structured formats suitable for analysis and integration. Construction projects generate vast amounts of PDF documentation - specifications, BOMs, schedules, and reports - that need to be extracted and processed.
Book Reference: "Преобразование данных в структурированную форму" / "Data Transformation to Structured Form"
> "Преобразование данных из неструктурированной в структурированную форму — это и искусство, и наука. Этот процесс часто занимает значительную часть работы инженера по обработке данных." > — DDC Book, Chapter 2.4
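The conversion follows the ETL pattern: extract the raw content from the PDF, transform it into tabular form, and load it into a structured format such as Excel, CSV, or JSON: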
import pdfplumber
import pandas as pd
# Extract the first table found on the first page of the PDF
with pdfplumber.open("construction_spec.pdf") as pdf:
    page = pdf.pages[0]
    table = page.extract_table()

# extract_table() returns None when no table is detected, so check before
# slicing the header row off
if table:
    # First row is used as the column header, the rest as data
    df = pd.DataFrame(table[1:], columns=table[0])
    df.to_excel("extracted_data.xlsx", index=False)
else:
    print("No table found on the first page")
# Core libraries
pip install pdfplumber pandas openpyxl
# For scanned PDFs (OCR)
pip install pytesseract pdf2image
# Also install Tesseract OCR: https://github.com/tesseract-ocr/tesseract
# pdf2image additionally needs the Poppler utilities installed on the system
# For the OCR table example (it imports cv2 and numpy)
pip install opencv-python numpy
# For advanced PDF operations
pip install pypdf
import pdfplumber
import pandas as pd
def extract_tables_from_pdf(pdf_path):
    """Extract every table from a PDF file into a single DataFrame.

    The first row of each detected table is used as its header. Tables with
    fewer than two rows (nothing below the header) are skipped.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A DataFrame with the rows of all tables stacked together, plus two
        bookkeeping columns: ``_page`` (1-based page number) and ``_table``
        (1-based index of the table on that page). An empty DataFrame is
        returned when no usable table is found.
    """
    all_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table_num, table in enumerate(page.extract_tables(), start=1):
                if not table or len(table) < 2:
                    continue

                # Header cells may be None, blank, or repeated. Missing or
                # duplicate labels make pd.concat fail when tables with
                # different columns are stacked, so every column gets a
                # unique, non-empty name. Usable labels are kept unchanged.
                columns = []
                used = set()
                for position, cell in enumerate(table[0], start=1):
                    label = cell if cell and cell.strip() else f"column_{position}"
                    unique = label
                    suffix = 1
                    while unique in used:
                        suffix += 1
                        unique = f"{label}_{suffix}"
                    used.add(unique)
                    columns.append(unique)

                df = pd.DataFrame(table[1:], columns=columns)
                df['_page'] = page_num
                df['_table'] = table_num
                all_tables.append(df)

    if all_tables:
        return pd.concat(all_tables, ignore_index=True)
    return pd.DataFrame()
# Usage: gather every table in the PDF into one DataFrame and save it as an Excel workbook
df = extract_tables_from_pdf("material_specification.pdf")
df.to_excel("materials.xlsx", index=False)  # index=False keeps the DataFrame index out of the file
import pdfplumber
def extract_text_with_layout(pdf_path):
    """Return the text of every page of a PDF as one string.

    Each page is read with ``page.extract_text()`` using its default
    options. Pages that yield no text are left out, and the remaining page
    texts are joined with a ``--- Page Break ---`` marker line.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated page texts, or an empty string when no page
        produced any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Skip pages for which extract_text() returned None or an empty string
    return "\n\n--- Page Break ---\n\n".join(chunk for chunk in page_texts if chunk)
# Usage: pull the text out of the report and save it as a UTF-8 text file
text = extract_text_with_layout("project_report.pdf")
with open("report_text.txt", "w", encoding="utf-8") as f:
    f.write(text)
import pdfplumber
import pandas as pd
def extract_table_from_area(pdf_path, page_num, bbox):
    """Extract one table from a rectangular region of a single page.

    Args:
        pdf_path: Path to PDF file
        page_num: Page number (0-indexed)
        bbox: Bounding box (x0, top, x1, bottom) in points

    Returns:
        A DataFrame whose columns come from the first extracted row, or an
        empty DataFrame when no table is found inside the region.
    """
    with pdfplumber.open(pdf_path) as pdf:
        region = pdf.pages[page_num].within_bbox(bbox)
        rows = region.extract_table()

    # extract_table() gives nothing usable when the region holds no table
    if not rows:
        return pd.DataFrame()

    header, *body = rows
    return pd.DataFrame(body, columns=header)
# Usage - extract the table located in a specific area of the first page (page_num=0)
# bbox format: (left, top, right, bottom) in points (1 inch = 72 points)
df = extract_table_from_area("drawing.pdf", 0, (50, 100, 550, 400))
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
def ocr_scanned_pdf(pdf_path, language='eng'):
    """Run OCR over every page of a scanned PDF.

    Args:
        pdf_path: Path to scanned PDF
        language: Tesseract language code (eng, deu, rus, etc.)

    Returns:
        A DataFrame with one row per page and two columns: ``page``
        (1-based page number) and ``text`` (the recognized text).
    """
    # Render each PDF page to an image at 300 dpi before recognition
    page_images = convert_from_path(pdf_path, dpi=300)

    records = [
        {
            'page': number,
            'text': pytesseract.image_to_string(img, lang=language),
        }
        for number, img in enumerate(page_images, start=1)
    ]
    return pd.DataFrame(records)
# Usage: OCR every page of the scanned PDF and save the per-page text as CSV
df = ocr_scanned_pdf("scanned_specification.pdf", language='eng')
df.to_csv("ocr_results.csv", index=False)  # index=False keeps the DataFrame index out of the file
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import cv2
import numpy as np
def ocr_table_from_scanned_pdf(pdf_path, page_num=0):
    """Extract a whitespace-delimited table from one page of a scanned PDF.

    The page is rendered to an image, converted to grayscale and passed to
    Tesseract. Each non-blank line of recognized text becomes a row, split
    on whitespace; the first row is used as the header. No table-structure
    detection is performed, so cells that contain spaces are split into
    several columns.

    Args:
        pdf_path: Path to the scanned PDF.
        page_num: Page number (0-indexed).

    Returns:
        A DataFrame built from the recognized rows, or an empty DataFrame
        when the page cannot be rendered or no text is recognized.
    """
    # pdf2image page numbers are 1-based, hence the +1
    images = convert_from_path(pdf_path, first_page=page_num + 1,
                               last_page=page_num + 1, dpi=300)
    if not images:
        return pd.DataFrame()

    # Tesseract binarizes its input internally, so the grayscale image is
    # passed as-is; no separate thresholding step is needed.
    gray = cv2.cvtColor(np.array(images[0]), cv2.COLOR_RGB2GRAY)

    # --psm 6: treat the page as a single uniform block of text
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(gray, config=custom_config)

    # One row per non-blank line, one cell per whitespace-separated token
    data = [line.split() for line in text.strip().split('\n') if line.strip()]
    if not data:
        return pd.DataFrame()

    # OCR lines rarely have the same number of tokens. Widen the header with
    # generated names and pad short rows so DataFrame construction cannot
    # fail on a column-count mismatch.
    width = max(len(row) for row in data)
    header = data[0] + [f"column_{n}" for n in range(len(data[0]) + 1, width + 1)]
    rows = [row + [None] * (width - len(row)) for row in data[1:]]
    return pd.DataFrame(rows, columns=header)
# Usage: OCR the first page (page_num defaults to 0) and show the parsed table
df = ocr_table_from_scanned_pdf("scanned_bom.pdf")
print(df)
import pdfplumber
import pandas as pd
import re
def extract_bom_from_pdf(pdf_path):
    """Extract Bill of Materials rows from a construction PDF.

    For every table on every page, the first row whose text contains one of
    the common BOM header keywords is taken as the header. Each later
    non-empty row in that table becomes one item keyed by the header labels.
    Tables without a recognizable header row are ignored.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A DataFrame with one row per BOM item; empty when nothing matched.
    """
    # Substrings that mark a BOM header row (matched case-insensitively)
    header_keywords = ('item', 'description', 'quantity', 'unit', 'material')
    all_items = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table or len(table) < 2:
                    continue

                for i, row in enumerate(table):
                    if not row:
                        continue
                    row_text = str(row).lower()
                    if not any(keyword in row_text for keyword in header_keywords):
                        continue

                    # Build unique, non-empty header labels. Blank or
                    # repeated labels would collide as dict keys below and
                    # silently drop cell values.
                    headers = []
                    used = set()
                    for position, cell in enumerate(row, start=1):
                        label = str(cell).strip() if cell is not None else ""
                        if not label:
                            label = f"column_{position}"
                        unique = label
                        suffix = 1
                        while unique in used:
                            suffix += 1
                            unique = f"{label}_{suffix}"
                        used.add(unique)
                        headers.append(unique)

                    # Every non-empty row after the header is a BOM item
                    for data_row in table[i + 1:]:
                        if data_row and any(data_row):
                            all_items.append(dict(zip(headers, data_row)))

                    # Only the first header row of a table is used
                    break

    return pd.DataFrame(all_items)
# Usage: extract the BOM items from the PDF and save them as an Excel workbook
bom = extract_bom_from_pdf("project_bom.pdf")
bom.to_excel("bom_extracted.xlsx", index=False)  # index=False keeps the DataFrame index out of the file
import pdfplumber
import pandas as pd
from datetime import datetime
def extract_schedule_from_pdf(pdf_path):
"""Extract project schedule/gantt data from PDF"""
with pdfplumber.open(pdf_path) as pdf:
all_tasks = []
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
...
安装 Pdf To Structured 后,可以对 AI 说这些话来触发它
Help me get started with Pdf To Structured
Explains what Pdf To Structured does, walks through the setup, and runs a quick demo based on your current project
Use Pdf To Structured to extract structured data from construction PDFs
Invokes Pdf To Structured with the right parameters and returns the result directly in the conversation
What can I do with Pdf To Structured in my documents & notes workflow?
Lists the top use cases for Pdf To Structured, with example commands for each scenario
将技能文件夹放到 ~/.claude/skills/pdf-to-structured/ 目录(个人级,所有项目可用),或 .claude/skills/pdf-to-structured/(项目级)。重启 AI 客户端后,用 /pdf-to-structured 主动调用,或让 AI 根据上下文自动发现并使用。
Pdf To Structured 支持 Claude、Cursor、OpenClaw,可与这些 AI 平台无缝集成,扩展其能力。
Pdf To Structured 可免费安装使用。请查阅仓库了解许可证信息。
Extract structured data from construction PDFs. Convert specifications, BOMs, schedules, and reports from PDF to Excel/CSV/JSON. Use OCR for scanned documents and pdfplumber for native PDFs.
Pdf To Structured 属于「Documents & Notes」分类,该分类的技能帮助 AI 智能体在此领域执行专业任务。
Automate my documents & notes tasks using Pdf To Structured
Identifies repetitive steps in your workflow and sets up Pdf To Structured to handle them automatically