Advanced PDF to Excel with documents and example code

This comment was posted to reddit on May 01, 2023 at 6:37 am and was deleted within 9 minutes.

import csv

import os

import PyPDF2

# Path to the PDF directory (placeholder: set before running)
pdf_dir = ''

# Path to the output CSV file (placeholder: set before running)
csv_file_path = ''

# Number of leading text lines dropped from every page
SKIP_LINES = 5


def page_rows(page_text, skip_lines=SKIP_LINES):
    """Return the lines of one page's text, minus the leading ones.

    Args:
        page_text: Text extracted from a single PDF page. May be empty or
            ``None`` when the page yields no text.
        skip_lines: How many lines to drop from the start of the page.

    Returns:
        A list of the remaining lines, split on ``'\\n'``. Empty when the
        page has no text or no more than ``skip_lines`` lines.
    """
    if not page_text:
        # Nothing extractable on this page; avoid calling .split on None.
        return []
    return page_text.split('\n')[skip_lines:]


def convert_pdfs_to_csv(source_dir, output_path, skip_lines=SKIP_LINES):
    """Append the text lines of every ``.pdf`` file in a directory to a CSV.

    Each remaining text line of each page becomes one single-column CSV row.
    The output file is opened in append mode, so rows from earlier runs are
    kept.

    Args:
        source_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.pdf``.
        output_path: CSV file to append to; created if it does not exist.
        skip_lines: Number of leading lines dropped from every page.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(
            csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL
        )

        # os.listdir order is arbitrary; sort so the row order is reproducible.
        for filename in sorted(os.listdir(source_dir)):
            if not filename.endswith('.pdf'):
                continue

            pdf_file_path = os.path.join(source_dir, filename)

            # The context manager closes the PDF even if parsing raises.
            with open(pdf_file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page in pdf_reader.pages:
                    lines = page_rows(page.extract_text(), skip_lines)
                    writer.writerows([line] for line in lines)


if __name__ == '__main__':
    convert_pdfs_to_csv(pdf_dir, csv_file_path)