from pyhere import here
import os
import json
from openai import OpenAI
= os.getenv("OPENAI_API_KEY")
api_key = OpenAI(api_key=api_key) client
3 Data Extraction (Part 2)
Transforming data according to a schema
Data Extraction Workflow: https://cookbook.openai.com/examples/data_extraction_transformation
3.1 Schema
with open(here("input/json/invoice-schema.json"), 'r') as file:
= json.load(file)
invoice_schema
invoice_schema
[{'hotel_information': {'name': 'string',
'address': {'street': 'string',
'city': 'string',
'country': 'string',
'postal_code': 'string'},
'contact': {'phone': 'string',
'fax': 'string',
'email': 'string',
'website': 'string'}},
'guest_information': {'company': 'string',
'address': 'string',
'guest_name': 'string'},
'invoice_information': {'invoice_number': 'string',
'reservation_number': 'string',
'date': 'YYYY-MM-DD',
'room_number': 'string',
'check_in_date': 'YYYY-MM-DD',
'check_out_date': 'YYYY-MM-DD'},
'charges': [{'date': 'YYYY-MM-DD',
'description': 'string',
'charge': 'number',
'credit': 'number'}],
'totals_summary': {'currency': 'string',
'total_net': 'number',
'total_tax': 'number',
'total_gross': 'number',
'total_charge': 'number',
'total_credit': 'number',
'balance_due': 'number'},
'taxes': [{'tax_type': 'string',
'tax_rate': 'string',
'net_amount': 'number',
'tax_amount': 'number',
'gross_amount': 'number'}]}]
3.2 Transform
def transform_invoice_data(json_raw, json_schema):
= f"""
system_prompt You are a data transformation tool that takes in JSON data and a reference JSON schema, and outputs JSON data according to the schema.
Not all of the data in the input JSON will fit the schema, so you may need to omit some data or add null values to the output JSON.
Translate all data into English if not already in English.
Ensure values are formatted as specified in the schema (e.g. dates as YYYY-MM-DD).
Here is the schema:
{json_schema}
"""
= client.chat.completions.create(
response ="gpt-4o",
model={ "type": "json_object" },
response_format=[
messages
{"role": "system",
"content": system_prompt
},
{"role": "user",
"content": [
"type": "text", "text": f"Transform the following raw JSON data according to the provided schema. Ensure all data is in English and formatted as specified by values in the schema. Here is the raw JSON: {json_raw}"}
{
]
}
],=0.0,
temperature
)return json.loads(response.choices[0].message.content)
def main_transform(extracted_invoice_json_path, json_schema_path, save_path):
# Load the JSON schema
with open(json_schema_path, 'r', encoding='utf-8') as f:
= json.load(f)
json_schema
# Ensure the save directory exists
=True)
os.makedirs(save_path, exist_ok
# Process each JSON file in the extracted invoices directory
for filename in os.listdir(extracted_invoice_json_path):
if filename.endswith(".json"):
= os.path.join(extracted_invoice_json_path, filename)
file_path
# Load the extracted JSON
with open(file_path, 'r', encoding='utf-8') as f:
= json.load(f)
json_raw
# Transform the JSON data
= transform_invoice_data(json_raw, json_schema)
transformed_json
# Save the transformed JSON to the save directory
= f"transformed_{filename}"
transformed_filename = os.path.join(save_path, transformed_filename)
transformed_file_path with open(transformed_file_path, 'w', encoding='utf-8') as f:
=False, indent=2)
json.dump(transformed_json, f, ensure_ascii
3.3 Execute !
= here("output/json/extracted_invoice_json")
extracted_invoice_json_path = here("input/json/invoice-schema.json")
json_schema_path = here("output/json/transformed_invoice_json")
save_path
main_transform(extracted_invoice_json_path, json_schema_path, save_path)
with open(here("output/json/transformed_invoice_json/transformed_20190119_002_extracted.json"), 'r') as file:
= json.load(file)
invoice_out_1
invoice_out_1
{'hotel_information': {'name': 'MADISON Hotel GmbH',
'address': {'street': 'Schaarsteinweg 4',
'city': 'Hamburg',
'country': 'Germany',
'postal_code': '20459'},
'contact': {'phone': '+49-40-37 666-0',
'fax': '+49-40-37 666-137',
'email': 'info@madisonhotel.de',
'website': 'madisonhotel.de'}},
'guest_information': {'company': 'APImeiseter Consulting GmbH',
'address': 'Friedrichstr. 123, 10117 Berlin',
'guest_name': 'Mr. Jens Walter'},
'invoice_information': {'invoice_number': '487155 /',
'reservation_number': None,
'date': '2019-01-18',
'room_number': '417',
'check_in_date': '2019-01-13',
'check_out_date': '2019-01-18'},
'charges': [{'date': '2019-01-13',
'description': 'Overnight stay excluding breakfast',
'charge': 110.0,
'credit': 0.0},
{'date': '2019-01-14',
'description': 'Overnight stay excluding breakfast',
'charge': 110.0,
'credit': 0.0},
{'date': '2019-01-15',
'description': 'Overnight stay excluding breakfast',
'charge': 110.0,
'credit': 0.0},
{'date': '2019-01-16',
'description': 'Overnight stay excluding breakfast',
'charge': 110.0,
'credit': 0.0},
{'date': '2019-01-17',
'description': 'Overnight stay excluding breakfast',
'charge': 110.0,
'credit': 0.0},
{'date': '2019-01-18',
'description': 'Mastercard IFC',
'charge': 0.0,
'credit': 550.0}],
'totals_summary': {'currency': 'EUR',
'total_net': 514.02,
'total_tax': 35.98,
'total_gross': 550.0,
'total_charge': 550.0,
'total_credit': 550.0,
'balance_due': 0.0},
'taxes': [{'tax_type': 'VAT 7%',
'tax_rate': '7%',
'net_amount': 514.02,
'tax_amount': 35.98,
'gross_amount': 550.0}]}