from pyhere import here
from IPython.display import display_markdown
from openai import OpenAI
import pymupdf
import io
import os
from PIL import Image
import base64
import json
= os.getenv("OPENAI_API_KEY")
api_key = OpenAI(api_key=api_key) client
2 Data Extraction (Part 1)
Data Extraction Workflow: https://cookbook.openai.com/examples/data_extraction_transformation
2.1 Extract PDF to base64 image
@staticmethod
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def pdf_to_base64_images(pdf_path):
# Handles PDFs with multiple pages
import pymupdf
= pymupdf.open(pdf_path)
pdf_document = []
base64_images = []
temp_image_paths
= len(pdf_document)
total_pages
for page_num in range(total_pages):
= pdf_document.load_page(page_num)
page = page.get_pixmap()
pix = Image.open(io.BytesIO(pix.tobytes()))
img = f"temp_page_{page_num}.png"
temp_image_path format="PNG")
img.save(temp_image_path,
temp_image_paths.append(temp_image_path)= encode_image(temp_image_path)
base64_image
base64_images.append(base64_image)
for temp_image_path in temp_image_paths:
os.remove(temp_image_path)
return base64_images
# Test Run
= pdf_to_base64_images(here("pdf/invoice/20190119_002.pdf")) img_base64_1
2.2 How To
2.2.1 Extract text from a PDF
import pymupdf
= pymupdf.open(here("pdf/invoice/20190119_002.pdf"))
doc = open("output.txt", "wb") # create a text output
out out
<_io.BufferedWriter name='output.txt'>
# Load Page
= doc.load_page(0)
page = page.get_pixmap()
pix = Image.open(io.BytesIO(pix.tobytes()))
img img
2.3 Extract one PDF
def extract_invoice_data(base64_image):
= f"""
system_prompt You are an OCR-like data extraction tool that extracts hotel invoice data from PDFs.
1. Please extract the data in this hotel invoice, grouping data according to theme/sub groups, and then output into JSON.
2. Please keep the keys and values of the JSON in the original language.
3. The type of data you might encounter in the invoice includes but is not limited to: hotel information, guest information, invoice information,
room charges, taxes, and total charges etc.
4. If the page contains no charge data, please output an empty JSON object and don't make up any data.
5. If there are blank data fields in the invoice, please include them as "null" values in the JSON object.
6. If there are tables in the invoice, capture all of the rows and columns in the JSON object.
Even if a column is blank, include it as a key in the JSON object with a null value.
7. If a row is blank denote missing fields with "null" values.
8. Don't interpolate or make up data.
9. Please maintain the table structure of the charges, i.e. capture all of the rows and columns in the JSON object.
"""
= client.chat.completions.create(
response ="gpt-4o",
model={ "type": "json_object" },
response_format=[
messages
{"role": "system",
"content": system_prompt
},
{"role": "user",
"content": [
"type": "text", "text": "extract the data in this hotel invoice and output into JSON "},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}", "detail": "high"}}
{
]
}
],=0.0,
temperature
)return response.choices[0].message.content
= extract_invoice_data(img_base64_1) img_base64_1_json
= json.loads(img_base64_1_json)
img_base64_1_dict "Hotel Information") img_base64_1_dict.get(
{'Name': 'MADISON Hotel GmbH',
'Address': 'Schaarsteinweg 4, 20459 Hamburg',
'Contact': {'Phone': '+49-40-37 666-0',
'Fax': '+49-40-37 666-137',
'Email': 'info@madisonhotel.de',
'Website': 'madisonhotel.de'},
'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
'AG': 'Hamburg HRB 47821',
'VAT': 'DE118 696 407',
'Bank': {'Name': 'HypoVereinsbank',
'BLZ': '200 300 00',
'Konto-Nr': '360 27 11',
'IBAN': 'DE48 2003 0000 0036 6027 11',
'BIC': 'HYVEDEMM300'}}
2.4 Extract All PDFs
Because invoice data can span multiple pages in a PDF, we’re going to produce JSON objects for each page in the invoice and then append them together. The final invoice extraction will be a single JSON file.
def extract_from_multiple_pages(base64_images, original_filename, output_directory):
= []
entire_invoice
for base64_image in base64_images:
= extract_invoice_data(base64_image)
invoice_json = json.loads(invoice_json)
invoice_data
entire_invoice.append(invoice_data)
# Ensure the output directory exists
=True)
os.makedirs(output_directory, exist_ok
# Construct the output file path
= os.path.join(output_directory, original_filename.replace('.pdf', '_extracted.json'))
output_filename
# Save the entire_invoice list as a JSON file
with open(output_filename, 'w', encoding='utf-8') as f:
=False, indent=4)
json.dump(entire_invoice, f, ensure_asciireturn output_filename
def main_extract(read_path, write_path):
for filename in os.listdir(read_path):
= os.path.join(read_path, filename)
file_path if os.path.isfile(file_path):
= pdf_to_base64_images(file_path)
base64_images extract_from_multiple_pages(base64_images, filename, write_path)
= here("pdf/invoice"), write_path = here("output/json/extracted_invoice_json")) main_extract(read_path
2.5 Data Lake of JSON
= []
invoices_list = here("output/json/extracted_invoice_json")
json_dir_path
for f in os.listdir(json_dir_path):
print(f)
with open(os.path.join(json_dir_path, f), 'r') as file:
= json.load(file)
invoice_dict
invoices_list.append(invoice_dict)
invoices_list
citadines-20190331_Invoice_extracted.json
20190119_002_extracted.json
20190202_THE MADISON HAMBURG_extracted.json
[[{'Rechnungsadresse': {'Name': 'APImeis\xadter Consulting GmbH',
'Adresse': 'Friedrichstrasse 123',
'Stadt': '10117 Berlin',
'Land': 'Germany'},
'Gastname': 'Herr Jens Walter',
'Reservierungsnr.': '1221737070-1',
'Apartment Nr.': '420',
'Apartment Typ': '1-Bedroom',
'Anreisedatum': '31/03/2019',
'Abreisedatum': '05/04/2019',
'Rechnungsnr.': '083-72561',
'Rechnungsdatum': '05/04/19 - 09:46',
'Bediener': 'DE-GB',
'Seite': '1 von 1',
'Tabelle': [{'DATUM': '31/03/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 31/03/19#420',
'EURO': '92,56'},
{'DATUM': '01/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 01/04/19#420',
'EURO': '197,20'},
{'DATUM': '02/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 02/04/19#420',
'EURO': '254,15'},
{'DATUM': '03/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 03/04/19#420',
'EURO': '215,90'},
{'DATUM': '04/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 04/04/19#420',
'EURO': '143,82'},
{'DATUM': '31/03/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Mastercard bnr 8324 SPLIT',
'EURO': '-903,63'}],
'Summe Belastungen': '903,63',
'Summe Zahlungen': '-903,63',
'offener Betrag': '0,00',
'MwSt.': [{'MwSt. %': '19.00 %',
'NETTO €': '0,00',
'MwSt €': '0,00',
'BRUTTO €': '0,00'},
{'MwSt. %': '7.00 %',
'NETTO €': '844,51',
'MwSt €': '59,12',
'BRUTTO €': '903,63'},
{'MwSt. %': '0.00 %',
'NETTO €': '0,00',
'MwSt €': '0,00',
'BRUTTO €': '0,00'}]},
{}],
[{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
'Address': 'Schaarsteinweg 4, 20459 Hamburg',
'Contact': {'Phone': '+49-40-37 666-0',
'Fax': '+49-40-37 666-137',
'Email': 'info@madisonhotel.de',
'Website': 'madisonhotel.de'},
'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
'AG': 'Hamburg HRB 47881',
'VAT': 'DE118 696 407',
'Bank': {'Name': 'HypoVereinsbank',
'BLZ': '200 300 00',
'Konto-Nr': '360 27 11',
'IBAN': 'DE48 2003 0000 0036 6027 11',
'BIC': 'HYVEDEMM300'}},
'Guest Information': {'Company': 'APImeiseter Consulting GmbH',
'Address': 'Friedrichstr. 123, 10117 Berlin',
'Guest Name': 'Herr Jens Walter'},
'Invoice Information': {'Rechnungs-Nr.': '487155 /',
'Datum': '18.01.19',
'Zimmer': '417',
'Anreise': '13.01.19',
'Abreise': '18.01.19',
'Seite': '1 of 1',
'Benutzer ID': 'RIL'},
'Charges': [{'Datum': '13.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '14.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '15.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '16.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '17.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '18.01.19',
'Beschreibung': 'Mastercard IFC',
'Belastung': None,
'Entlastung': '550.00'}],
'Summary': {'Total inkl. MwSt.': '514.02',
'MwSt. 7%': '514.02',
'MwSt. EUR': '35.98',
'Brutto EUR': '550.00',
'Total': '550.00',
'Saldo': '0.00 EUR'},
'Finanzamt': {'Name': 'Hamburg Mitte', 'Steuernummer': '4874/101/1228'},
'Kreditkarten Details': {'Vertragsnummer': '154884832',
'Beleg Nr.': '21891',
'Kreditkartennummer': 'XXXXXXXXXXXX5052',
'Transaktionsbetrag': '550.00',
'Verfallsdatum': 'XX/XX',
'Genehmigter Betrag': '550.00',
'Terminal ID': '69294692',
'Genehmigungscode': '95769'}},
{}],
[{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
'Address': 'Schaarsteinweg 4, 20459 Hamburg',
'Contact': {'Phone': '+49.40.37 666-0',
'Fax': '+49.40.37 666-137',
'Email': 'info@madisonhotel.de',
'Website': 'madisonhotel.de'}},
'Invoice Information': {'Rechnungs-Nr.': '487812/1',
'Datum': '25.01.19',
'Zimmer': '316',
'Anreise': '20.01.19',
'Abreise': '25.01.19',
'Seite': '1 of 1',
'Benutzer ID': 'POE'},
'Guest Information': {'Gastname': 'Herr Jens Walter'},
'Charges': [{'Datum': '20.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '21.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '22.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '23.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '24.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '25.01.19',
'Beschreibung': 'Mastercard IFC',
'Belastung': '550.00',
'Entlastung': None}],
'Summary': {'Total': '550.00', 'Saldo': '0.00 EUR'},
'Tax Details': {'Umsatzsteuer Detail': {'Netto EUR': '514.02',
'MwSt. EUR': '35.98',
'Brutto EUR': '550.00'},
'Total inkl. MwSt.': '550.00',
'MwSt. 7%': {'Netto EUR': '514.02', 'MwSt. EUR': '35.98'}},
'Payment Information': {'Finanzamt': 'Hamburg Mitte',
'Steuernummer': '487/4010/1228',
'Kreditkartendetails': {'Kartennummer': '154984832',
'Vertragspartner': '100000000000X5052',
'Verfallsdatum': 'XX/XX',
'Terminal ID': '92964892',
'Beleg Nr.': '22161',
'Transaktionsbetrag': '550.00',
'Genehmigter Betrag': '550.00',
'Genehmigungsnummer': '149460'}}},
{}]]
len(invoices_list)
3
invoices_list
[[{'Rechnungsadresse': {'Name': 'APImeis\xadter Consulting GmbH',
'Adresse': 'Friedrichstrasse 123',
'Stadt': '10117 Berlin',
'Land': 'Germany'},
'Gastname': 'Herr Jens Walter',
'Reservierungsnr.': '1221737070-1',
'Apartment Nr.': '420',
'Apartment Typ': '1-Bedroom',
'Anreisedatum': '31/03/2019',
'Abreisedatum': '05/04/2019',
'Rechnungsnr.': '083-72561',
'Rechnungsdatum': '05/04/19 - 09:46',
'Bediener': 'DE-GB',
'Seite': '1 von 1',
'Tabelle': [{'DATUM': '31/03/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 31/03/19#420',
'EURO': '92,56'},
{'DATUM': '01/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 01/04/19#420',
'EURO': '197,20'},
{'DATUM': '02/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 02/04/19#420',
'EURO': '254,15'},
{'DATUM': '03/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 03/04/19#420',
'EURO': '215,90'},
{'DATUM': '04/04/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Apartment 04/04/19#420',
'EURO': '143,82'},
{'DATUM': '31/03/19',
'APT NR.': '420',
'BESCHREIBUNG': 'Mastercard bnr 8324 SPLIT',
'EURO': '-903,63'}],
'Summe Belastungen': '903,63',
'Summe Zahlungen': '-903,63',
'offener Betrag': '0,00',
'MwSt.': [{'MwSt. %': '19.00 %',
'NETTO €': '0,00',
'MwSt €': '0,00',
'BRUTTO €': '0,00'},
{'MwSt. %': '7.00 %',
'NETTO €': '844,51',
'MwSt €': '59,12',
'BRUTTO €': '903,63'},
{'MwSt. %': '0.00 %',
'NETTO €': '0,00',
'MwSt €': '0,00',
'BRUTTO €': '0,00'}]},
{}],
[{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
'Address': 'Schaarsteinweg 4, 20459 Hamburg',
'Contact': {'Phone': '+49-40-37 666-0',
'Fax': '+49-40-37 666-137',
'Email': 'info@madisonhotel.de',
'Website': 'madisonhotel.de'},
'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
'AG': 'Hamburg HRB 47881',
'VAT': 'DE118 696 407',
'Bank': {'Name': 'HypoVereinsbank',
'BLZ': '200 300 00',
'Konto-Nr': '360 27 11',
'IBAN': 'DE48 2003 0000 0036 6027 11',
'BIC': 'HYVEDEMM300'}},
'Guest Information': {'Company': 'APImeiseter Consulting GmbH',
'Address': 'Friedrichstr. 123, 10117 Berlin',
'Guest Name': 'Herr Jens Walter'},
'Invoice Information': {'Rechnungs-Nr.': '487155 /',
'Datum': '18.01.19',
'Zimmer': '417',
'Anreise': '13.01.19',
'Abreise': '18.01.19',
'Seite': '1 of 1',
'Benutzer ID': 'RIL'},
'Charges': [{'Datum': '13.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '14.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '15.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '16.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '17.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '18.01.19',
'Beschreibung': 'Mastercard IFC',
'Belastung': None,
'Entlastung': '550.00'}],
'Summary': {'Total inkl. MwSt.': '514.02',
'MwSt. 7%': '514.02',
'MwSt. EUR': '35.98',
'Brutto EUR': '550.00',
'Total': '550.00',
'Saldo': '0.00 EUR'},
'Finanzamt': {'Name': 'Hamburg Mitte', 'Steuernummer': '4874/101/1228'},
'Kreditkarten Details': {'Vertragsnummer': '154884832',
'Beleg Nr.': '21891',
'Kreditkartennummer': 'XXXXXXXXXXXX5052',
'Transaktionsbetrag': '550.00',
'Verfallsdatum': 'XX/XX',
'Genehmigter Betrag': '550.00',
'Terminal ID': '69294692',
'Genehmigungscode': '95769'}},
{}],
[{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
'Address': 'Schaarsteinweg 4, 20459 Hamburg',
'Contact': {'Phone': '+49.40.37 666-0',
'Fax': '+49.40.37 666-137',
'Email': 'info@madisonhotel.de',
'Website': 'madisonhotel.de'}},
'Invoice Information': {'Rechnungs-Nr.': '487812/1',
'Datum': '25.01.19',
'Zimmer': '316',
'Anreise': '20.01.19',
'Abreise': '25.01.19',
'Seite': '1 of 1',
'Benutzer ID': 'POE'},
'Guest Information': {'Gastname': 'Herr Jens Walter'},
'Charges': [{'Datum': '20.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '21.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '22.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '23.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '24.01.19',
'Beschreibung': 'Übernachtung exklusive Frühstück',
'Belastung': '110.00',
'Entlastung': None},
{'Datum': '25.01.19',
'Beschreibung': 'Mastercard IFC',
'Belastung': '550.00',
'Entlastung': None}],
'Summary': {'Total': '550.00', 'Saldo': '0.00 EUR'},
'Tax Details': {'Umsatzsteuer Detail': {'Netto EUR': '514.02',
'MwSt. EUR': '35.98',
'Brutto EUR': '550.00'},
'Total inkl. MwSt.': '550.00',
'MwSt. 7%': {'Netto EUR': '514.02', 'MwSt. EUR': '35.98'}},
'Payment Information': {'Finanzamt': 'Hamburg Mitte',
'Steuernummer': '487/4010/1228',
'Kreditkartendetails': {'Kartennummer': '154984832',
'Vertragspartner': '100000000000X5052',
'Verfallsdatum': 'XX/XX',
'Terminal ID': '92964892',
'Beleg Nr.': '22161',
'Transaktionsbetrag': '550.00',
'Genehmigter Betrag': '550.00',
'Genehmigungsnummer': '149460'}}},
{}]]