import tabula
import pandas as pd
# Extract the pipe-tally table from one page of the 2021 inspection report PDF
# and save it as a CSV file.
#
# Table boundaries for tabula's ``area`` option.
# tabula-py expects ``area`` in the order (top, left, bottom, right). These
# values used to be passed as (top, left, right, bottom), so the number called
# "right" actually set the BOTTOM edge and the number called "bottom" set the
# RIGHT edge -- which is why raising "right" never brought the last column in.
# The values below are exactly what tabula effectively received before
# (bottom edge 700, right edge 900), now stored under their correct names.
# NOTE(review): if the last column is still cut off, raise ``right`` until it
# reaches the right-hand edge of the table -- confirm against the PDF page size.
top = 150
left = 60
bottom = 700
right = 900

# Read the PDF and extract tables
dfs = tabula.read_pdf(
    "/Users/admin/Desktop/raw /Oil Lines/South/CL40/IP Data/2021/12P49SSP_FR.pdf",
    # Only page 84 is read for now
    pages='84',
    # Do not let tabula guess the table region; use the explicit area below
    guess=False,
    # NOTE(review): lattice mode is meant for tables drawn with ruling lines;
    # if the last column has no borders it may need stream mode -- confirm.
    lattice=True,
    # Order matters: (top, left, bottom, right)
    area=(top, left, bottom, right),
)

# Check if tables were extracted
if not dfs:
    print("No tables found in the specified area.")
else:
    # Combine tables into a single DataFrame
    combined_df = pd.concat(dfs)

    # Full header of the pipe tally; the final entry is the column that was
    # previously being cut off.
    column_names = [
        'log distance [m]', 'feature type', 'feature identification',
        'comment', 'cl. id', 'joint number', 'weld type', 'length comp. [m]',
        'ID [mm]', 't [mm]', 'bend Y/N', 'to u/s weld [m]', 'clock pos.',
        'surf. loc.', 'avg d [%]', 'd[%]', 'l [mm]', 'w [mm]', 'dim. class',
        'ERF B31G', 'PB weldnr.', 'location class',
        'DIST_2014 [m]',
    ]

    extracted_columns = combined_df.shape[1]
    if extracted_columns == len(column_names) - 1:
        # The last column is still outside the extracted region: keep the
        # previous behaviour (label the 22 columns that were found) but make
        # the problem visible instead of hiding it.
        print(
            "Warning: the last column ('DIST_2014 [m]') was not extracted; "
            "increase 'right' so the area covers it."
        )
        column_names = column_names[:-1]

    # Any other column-count mismatch still raises pandas' ValueError, so an
    # unexpected table layout is never silently mislabelled.
    combined_df.columns = column_names

    # Save the combined table as a CSV file
    output_path = "/Users/admin/Desktop/Stages/Stage1/Oil Lines/South/CL40/PIPE_TALLY_2021.csv"
    combined_df.to_csv(output_path, index=False)
    print("2021 Combined table saved as CSV")
I keep changing the 'right' variable to a huge number, but I still can't get the last column in my PDF file, and I'm not sure why. When I edited the column names, I kept getting an error saying that only 22 elements were found, which is why I commented out the last column name, but I don't really understand why that column cannot be found.