2  Data Extraction (Part 1)

Data Extraction Workflow: https://cookbook.openai.com/examples/data_extraction_transformation

from pyhere import here
from IPython.display import display_markdown

from openai import OpenAI
import pymupdf
import io
import os
from PIL import Image
import base64
import json

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

2.1 Extract PDF to base64 image

@staticmethod
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def pdf_to_base64_images(pdf_path):
    # Handles PDFs with multiple pages
    import pymupdf
    
    pdf_document = pymupdf.open(pdf_path)
    base64_images = []
    temp_image_paths = []

    total_pages = len(pdf_document)

    for page_num in range(total_pages):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes()))
        temp_image_path = f"temp_page_{page_num}.png"
        img.save(temp_image_path, format="PNG")
        temp_image_paths.append(temp_image_path)
        base64_image = encode_image(temp_image_path)
        base64_images.append(base64_image)

    for temp_image_path in temp_image_paths:
        os.remove(temp_image_path)

    return base64_images
# Test Run
img_base64_1 = pdf_to_base64_images(here("pdf/invoice/20190119_002.pdf"))

2.2 How To

2.2.1 Extract text from a PDF

PyMuPDF

import pymupdf

doc = pymupdf.open(here("pdf/invoice/20190119_002.pdf")) 
out = open("output.txt", "wb") # create a text output
out
<_io.BufferedWriter name='output.txt'>
# Load Page
page = doc.load_page(0)
pix = page.get_pixmap()
img = Image.open(io.BytesIO(pix.tobytes()))
img

2.3 Extract one PDF

def extract_invoice_data(base64_image):
    system_prompt = f"""
    You are an OCR-like data extraction tool that extracts hotel invoice data from PDFs.
   
    1. Please extract the data in this hotel invoice, grouping data according to theme/sub groups, and then output into JSON.

    2. Please keep the keys and values of the JSON in the original language. 

    3. The type of data you might encounter in the invoice includes but is not limited to: hotel information, guest information, invoice information,
    room charges, taxes, and total charges etc. 

    4. If the page contains no charge data, please output an empty JSON object and don't make up any data.

    5. If there are blank data fields in the invoice, please include them as "null" values in the JSON object.
    
    6. If there are tables in the invoice, capture all of the rows and columns in the JSON object. 
    Even if a column is blank, include it as a key in the JSON object with a null value.
    
    7. If a row is blank denote missing fields with "null" values. 
    
    8. Don't interpolate or make up data.

    9. Please maintain the table structure of the charges, i.e. capture all of the rows and columns in the JSON object.

    """
    
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={ "type": "json_object" },
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "extract the data in this hotel invoice and output into JSON "},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}", "detail": "high"}}
                ]
            }
        ],
        temperature=0.0,
    )
    return response.choices[0].message.content
img_base64_1_json = extract_invoice_data(img_base64_1)
img_base64_1_dict = json.loads(img_base64_1_json) 
img_base64_1_dict.get("Hotel Information")
{'Name': 'MADISON Hotel GmbH',
 'Address': 'Schaarsteinweg 4, 20459 Hamburg',
 'Contact': {'Phone': '+49-40-37 666-0',
  'Fax': '+49-40-37 666-137',
  'Email': 'info@madisonhotel.de',
  'Website': 'madisonhotel.de'},
 'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
 'AG': 'Hamburg HRB 47821',
 'VAT': 'DE118 696 407',
 'Bank': {'Name': 'HypoVereinsbank',
  'BLZ': '200 300 00',
  'Konto-Nr': '360 27 11',
  'IBAN': 'DE48 2003 0000 0036 6027 11',
  'BIC': 'HYVEDEMM300'}}

2.4 Extract All PDFs

Because invoice data can span multiple pages in a PDF, we’re going to produce JSON objects for each page in the invoice and then append them together. The final invoice extraction will be a single JSON file.

def extract_from_multiple_pages(base64_images, original_filename, output_directory):
    entire_invoice = []

    for base64_image in base64_images:
        invoice_json = extract_invoice_data(base64_image)
        invoice_data = json.loads(invoice_json)
        entire_invoice.append(invoice_data)

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Construct the output file path
    output_filename = os.path.join(output_directory, original_filename.replace('.pdf', '_extracted.json'))
    
    # Save the entire_invoice list as a JSON file
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(entire_invoice, f, ensure_ascii=False, indent=4)
    return output_filename


def main_extract(read_path, write_path):
    for filename in os.listdir(read_path):
        file_path = os.path.join(read_path, filename)
        if os.path.isfile(file_path):
            base64_images = pdf_to_base64_images(file_path)
            extract_from_multiple_pages(base64_images, filename, write_path)
main_extract(read_path = here("pdf/invoice"), write_path = here("output/json/extracted_invoice_json"))

2.5 Data Lake of JSON

invoices_list = []
json_dir_path = here("output/json/extracted_invoice_json")

for f in os.listdir(json_dir_path):
    print(f)
    with open(os.path.join(json_dir_path, f), 'r') as file:
        invoice_dict = json.load(file)
        invoices_list.append(invoice_dict)
        
invoices_list
citadines-20190331_Invoice_extracted.json
20190119_002_extracted.json
20190202_THE MADISON HAMBURG_extracted.json
[[{'Rechnungsadresse': {'Name': 'APImeis\xadter Consulting GmbH',
    'Adresse': 'Friedrichstrasse 123',
    'Stadt': '10117 Berlin',
    'Land': 'Germany'},
   'Gastname': 'Herr Jens Walter',
   'Reservierungsnr.': '1221737070-1',
   'Apartment Nr.': '420',
   'Apartment Typ': '1-Bedroom',
   'Anreisedatum': '31/03/2019',
   'Abreisedatum': '05/04/2019',
   'Rechnungsnr.': '083-72561',
   'Rechnungsdatum': '05/04/19 - 09:46',
   'Bediener': 'DE-GB',
   'Seite': '1 von 1',
   'Tabelle': [{'DATUM': '31/03/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 31/03/19#420',
     'EURO': '92,56'},
    {'DATUM': '01/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 01/04/19#420',
     'EURO': '197,20'},
    {'DATUM': '02/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 02/04/19#420',
     'EURO': '254,15'},
    {'DATUM': '03/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 03/04/19#420',
     'EURO': '215,90'},
    {'DATUM': '04/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 04/04/19#420',
     'EURO': '143,82'},
    {'DATUM': '31/03/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Mastercard bnr 8324 SPLIT',
     'EURO': '-903,63'}],
   'Summe Belastungen': '903,63',
   'Summe Zahlungen': '-903,63',
   'offener Betrag': '0,00',
   'MwSt.': [{'MwSt. %': '19.00 %',
     'NETTO €': '0,00',
     'MwSt €': '0,00',
     'BRUTTO €': '0,00'},
    {'MwSt. %': '7.00 %',
     'NETTO €': '844,51',
     'MwSt €': '59,12',
     'BRUTTO €': '903,63'},
    {'MwSt. %': '0.00 %',
     'NETTO €': '0,00',
     'MwSt €': '0,00',
     'BRUTTO €': '0,00'}]},
  {}],
 [{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
    'Address': 'Schaarsteinweg 4, 20459 Hamburg',
    'Contact': {'Phone': '+49-40-37 666-0',
     'Fax': '+49-40-37 666-137',
     'Email': 'info@madisonhotel.de',
     'Website': 'madisonhotel.de'},
    'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
    'AG': 'Hamburg HRB 47881',
    'VAT': 'DE118 696 407',
    'Bank': {'Name': 'HypoVereinsbank',
     'BLZ': '200 300 00',
     'Konto-Nr': '360 27 11',
     'IBAN': 'DE48 2003 0000 0036 6027 11',
     'BIC': 'HYVEDEMM300'}},
   'Guest Information': {'Company': 'APImeiseter Consulting GmbH',
    'Address': 'Friedrichstr. 123, 10117 Berlin',
    'Guest Name': 'Herr Jens Walter'},
   'Invoice Information': {'Rechnungs-Nr.': '487155 /',
    'Datum': '18.01.19',
    'Zimmer': '417',
    'Anreise': '13.01.19',
    'Abreise': '18.01.19',
    'Seite': '1 of 1',
    'Benutzer ID': 'RIL'},
   'Charges': [{'Datum': '13.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '14.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '15.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '16.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '17.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '18.01.19',
     'Beschreibung': 'Mastercard IFC',
     'Belastung': None,
     'Entlastung': '550.00'}],
   'Summary': {'Total inkl. MwSt.': '514.02',
    'MwSt. 7%': '514.02',
    'MwSt. EUR': '35.98',
    'Brutto EUR': '550.00',
    'Total': '550.00',
    'Saldo': '0.00 EUR'},
   'Finanzamt': {'Name': 'Hamburg Mitte', 'Steuernummer': '4874/101/1228'},
   'Kreditkarten Details': {'Vertragsnummer': '154884832',
    'Beleg Nr.': '21891',
    'Kreditkartennummer': 'XXXXXXXXXXXX5052',
    'Transaktionsbetrag': '550.00',
    'Verfallsdatum': 'XX/XX',
    'Genehmigter Betrag': '550.00',
    'Terminal ID': '69294692',
    'Genehmigungscode': '95769'}},
  {}],
 [{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
    'Address': 'Schaarsteinweg 4, 20459 Hamburg',
    'Contact': {'Phone': '+49.40.37 666-0',
     'Fax': '+49.40.37 666-137',
     'Email': 'info@madisonhotel.de',
     'Website': 'madisonhotel.de'}},
   'Invoice Information': {'Rechnungs-Nr.': '487812/1',
    'Datum': '25.01.19',
    'Zimmer': '316',
    'Anreise': '20.01.19',
    'Abreise': '25.01.19',
    'Seite': '1 of 1',
    'Benutzer ID': 'POE'},
   'Guest Information': {'Gastname': 'Herr Jens Walter'},
   'Charges': [{'Datum': '20.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '21.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '22.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '23.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '24.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '25.01.19',
     'Beschreibung': 'Mastercard IFC',
     'Belastung': '550.00',
     'Entlastung': None}],
   'Summary': {'Total': '550.00', 'Saldo': '0.00 EUR'},
   'Tax Details': {'Umsatzsteuer Detail': {'Netto EUR': '514.02',
     'MwSt. EUR': '35.98',
     'Brutto EUR': '550.00'},
    'Total inkl. MwSt.': '550.00',
    'MwSt. 7%': {'Netto EUR': '514.02', 'MwSt. EUR': '35.98'}},
   'Payment Information': {'Finanzamt': 'Hamburg Mitte',
    'Steuernummer': '487/4010/1228',
    'Kreditkartendetails': {'Kartennummer': '154984832',
     'Vertragspartner': '100000000000X5052',
     'Verfallsdatum': 'XX/XX',
     'Terminal ID': '92964892',
     'Beleg Nr.': '22161',
     'Transaktionsbetrag': '550.00',
     'Genehmigter Betrag': '550.00',
     'Genehmigungsnummer': '149460'}}},
  {}]]
len(invoices_list)
3
invoices_list
[[{'Rechnungsadresse': {'Name': 'APImeis\xadter Consulting GmbH',
    'Adresse': 'Friedrichstrasse 123',
    'Stadt': '10117 Berlin',
    'Land': 'Germany'},
   'Gastname': 'Herr Jens Walter',
   'Reservierungsnr.': '1221737070-1',
   'Apartment Nr.': '420',
   'Apartment Typ': '1-Bedroom',
   'Anreisedatum': '31/03/2019',
   'Abreisedatum': '05/04/2019',
   'Rechnungsnr.': '083-72561',
   'Rechnungsdatum': '05/04/19 - 09:46',
   'Bediener': 'DE-GB',
   'Seite': '1 von 1',
   'Tabelle': [{'DATUM': '31/03/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 31/03/19#420',
     'EURO': '92,56'},
    {'DATUM': '01/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 01/04/19#420',
     'EURO': '197,20'},
    {'DATUM': '02/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 02/04/19#420',
     'EURO': '254,15'},
    {'DATUM': '03/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 03/04/19#420',
     'EURO': '215,90'},
    {'DATUM': '04/04/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Apartment 04/04/19#420',
     'EURO': '143,82'},
    {'DATUM': '31/03/19',
     'APT NR.': '420',
     'BESCHREIBUNG': 'Mastercard bnr 8324 SPLIT',
     'EURO': '-903,63'}],
   'Summe Belastungen': '903,63',
   'Summe Zahlungen': '-903,63',
   'offener Betrag': '0,00',
   'MwSt.': [{'MwSt. %': '19.00 %',
     'NETTO €': '0,00',
     'MwSt €': '0,00',
     'BRUTTO €': '0,00'},
    {'MwSt. %': '7.00 %',
     'NETTO €': '844,51',
     'MwSt €': '59,12',
     'BRUTTO €': '903,63'},
    {'MwSt. %': '0.00 %',
     'NETTO €': '0,00',
     'MwSt €': '0,00',
     'BRUTTO €': '0,00'}]},
  {}],
 [{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
    'Address': 'Schaarsteinweg 4, 20459 Hamburg',
    'Contact': {'Phone': '+49-40-37 666-0',
     'Fax': '+49-40-37 666-137',
     'Email': 'info@madisonhotel.de',
     'Website': 'madisonhotel.de'},
    'Geschäftsführer': 'Marlies Head, Thomas Kleinertz',
    'AG': 'Hamburg HRB 47881',
    'VAT': 'DE118 696 407',
    'Bank': {'Name': 'HypoVereinsbank',
     'BLZ': '200 300 00',
     'Konto-Nr': '360 27 11',
     'IBAN': 'DE48 2003 0000 0036 6027 11',
     'BIC': 'HYVEDEMM300'}},
   'Guest Information': {'Company': 'APImeiseter Consulting GmbH',
    'Address': 'Friedrichstr. 123, 10117 Berlin',
    'Guest Name': 'Herr Jens Walter'},
   'Invoice Information': {'Rechnungs-Nr.': '487155 /',
    'Datum': '18.01.19',
    'Zimmer': '417',
    'Anreise': '13.01.19',
    'Abreise': '18.01.19',
    'Seite': '1 of 1',
    'Benutzer ID': 'RIL'},
   'Charges': [{'Datum': '13.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '14.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '15.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '16.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '17.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '18.01.19',
     'Beschreibung': 'Mastercard IFC',
     'Belastung': None,
     'Entlastung': '550.00'}],
   'Summary': {'Total inkl. MwSt.': '514.02',
    'MwSt. 7%': '514.02',
    'MwSt. EUR': '35.98',
    'Brutto EUR': '550.00',
    'Total': '550.00',
    'Saldo': '0.00 EUR'},
   'Finanzamt': {'Name': 'Hamburg Mitte', 'Steuernummer': '4874/101/1228'},
   'Kreditkarten Details': {'Vertragsnummer': '154884832',
    'Beleg Nr.': '21891',
    'Kreditkartennummer': 'XXXXXXXXXXXX5052',
    'Transaktionsbetrag': '550.00',
    'Verfallsdatum': 'XX/XX',
    'Genehmigter Betrag': '550.00',
    'Terminal ID': '69294692',
    'Genehmigungscode': '95769'}},
  {}],
 [{'Hotel Information': {'Name': 'MADISON Hotel GmbH',
    'Address': 'Schaarsteinweg 4, 20459 Hamburg',
    'Contact': {'Phone': '+49.40.37 666-0',
     'Fax': '+49.40.37 666-137',
     'Email': 'info@madisonhotel.de',
     'Website': 'madisonhotel.de'}},
   'Invoice Information': {'Rechnungs-Nr.': '487812/1',
    'Datum': '25.01.19',
    'Zimmer': '316',
    'Anreise': '20.01.19',
    'Abreise': '25.01.19',
    'Seite': '1 of 1',
    'Benutzer ID': 'POE'},
   'Guest Information': {'Gastname': 'Herr Jens Walter'},
   'Charges': [{'Datum': '20.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '21.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '22.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '23.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '24.01.19',
     'Beschreibung': 'Übernachtung exklusive Frühstück',
     'Belastung': '110.00',
     'Entlastung': None},
    {'Datum': '25.01.19',
     'Beschreibung': 'Mastercard IFC',
     'Belastung': '550.00',
     'Entlastung': None}],
   'Summary': {'Total': '550.00', 'Saldo': '0.00 EUR'},
   'Tax Details': {'Umsatzsteuer Detail': {'Netto EUR': '514.02',
     'MwSt. EUR': '35.98',
     'Brutto EUR': '550.00'},
    'Total inkl. MwSt.': '550.00',
    'MwSt. 7%': {'Netto EUR': '514.02', 'MwSt. EUR': '35.98'}},
   'Payment Information': {'Finanzamt': 'Hamburg Mitte',
    'Steuernummer': '487/4010/1228',
    'Kreditkartendetails': {'Kartennummer': '154984832',
     'Vertragspartner': '100000000000X5052',
     'Verfallsdatum': 'XX/XX',
     'Terminal ID': '92964892',
     'Beleg Nr.': '22161',
     'Transaktionsbetrag': '550.00',
     'Genehmigter Betrag': '550.00',
     'Genehmigungsnummer': '149460'}}},
  {}]]