This notebook extracts data from yearly Almond Acreage Report PDFs found on the website of the Almond Board of California. Over the summer of 2021 I worked as an Assistant Economic Research intern at AgIS Capital, where I manually had to copy and paste this data into Excel in order to do any sort of analysis. And then I said never again. As one might imagine, this was very monotonous work, and I started to wonder if I could write Python code that might make the process more efficient. While I have partly failed to accomplish this — 2013, and the years 2007 and earlier, wound up being impossible to get data out of — I was able to create CSVs for 2021 through 2008. Furthermore, as new reports are released, I will be able to use my class ALMOND_PDF to create a new CSV file.
The order of this document is:
Converting PDFs to CSVs
Combining these CSVs into one CSV for all years
#import modules
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import os
import PyPDF2
from itertools import chain
import pdfplumber
from functions import ALMOND_PDF
#Class used to convert acreage-report PDFs from the Almond Board into tabular data
class ALMOND_PDF:
    """
    Using pdfplumber, extract data from one page of an almond acreage report.

    The methods are intended to be called in order:
    clean_data() -> row_check() -> append_acres() -> column_append().
    Results accumulate in `name_dict` (acreage values keyed by row name)
    and `columns` (column labels); both are caller-supplied lists/dicts
    and are mutated in place, so they can be shared across the two pages
    of a report.

    Parameters
    ----------
    year : int
        The year of report you are looking for.
    name_dict : dict
        Maps row names (e.g. counties or varieties) to lists that the
        extracted acreage values are appended to.
    pdf_path : str
        Path of the report PDF to open.
    columns : list
        List that column labels are appended to (shared across pages).
    n : int
        The page number, default is 1.
    verbose : bool
        Should the methods show extra extraction info to debug?
    """
    def __init__(self, year, name_dict, pdf_path, columns, n=1, verbose=False):
        self.name_dict = name_dict
        self.year= year
        self.n = n
        self.verbose=verbose
        self.columns=columns
        # Opening the pdf and reading page n with pdfplumber; rawdata is the
        # page text split into a list of lines.
        with pdfplumber.open(pdf_path) as pdf:
            self.rawdata = pdf.pages[n].extract_text().split('\n')

    def clean_data(self, empty=''):
        """
        Regularize self.rawdata into one flat list of word/number tokens.

        Some of the PDFs yield one string per table row while others yield
        one string per cell; after this call self.rawdata is a single flat
        token list either way.

        NOTE(review): the `empty` parameter is never used in the body.
        """
        rawdata = self.rawdata
        #Creating empty list and looping through data
        l = []
        for i in range(len(rawdata)):
            a = rawdata[i]
            # if the value is a string containing multiple values, split it into a list
            z = a.split(' ')
            try:
                # Probe: does the line start with an integer token?
                int(z[0])
                # Past the first few header lines, drop that leading numeric
                # token (presumably a row-label/year marker — TODO confirm
                # against the actual PDFs).
                if i>3:
                    z.pop(0)
                # appending to list
                l.append(z)
            except:
                # First token is not an integer: keep the split line as-is.
                l.append(z)
        #using itertools' chain function, flatten this list of lists into one list
        rawdata = list(chain(*l))
        self.rawdata = rawdata
        # if verbose is true show data
        # NOTE(review): bare expression — evaluates but never prints anything.
        if self.verbose ==True:
            self.rawdata

    def row_check(self):
        """
        Check the token stream for every row name we are searching for, and
        repair names that were split into multiple tokens (e.g. "San Luis
        Obispo"). If verbose, prints the unmatched tokens and asks (via
        input()) whether to search for the missing names, so this method is
        interactive in verbose mode.
        """
        #define your variables
        name_dict = self.name_dict
        data = self.rawdata
        verbose = self.verbose
        #List to append found row names to
        rowlist = []
        i = -1
        #loop through data to find tokens that are keys of name_dict and append them
        while i < (len(data)-1):
            i += 1
            if (data[i] in name_dict):
                rowlist.append(data[i])
        #Check if there are missing values
        Missing = list(set(name_dict) - set(rowlist))
        if len(Missing) == 0:
            print("No missing vaules!")
        #If there are missing values begin the search (likely just names with multiple words)
        else:
            continu = "Y"
            #prints out the unmatched tokens if verbose, and asks whether to continue
            if verbose==True:
                print("-------------")
                while i < (len(data)-1):
                    i += 1
                    if (data[i] != ' '):
                        if (data[i] not in name_dict):
                            print(data[i])
                print("-------------")
                continu = input("Begin search for missing value Y or N?")
            print(f"These values are missing: {Missing}")
            while continu != "N":
                #loop through missing values
                for M in range(len(Missing)):
                    #Split the missing name into its words
                    correct = Missing[M]
                    z = correct.split(' ')
                    error = z[0]
                    found = "N"
                    #loop through data from the start
                    i = -1
                    while i < (len(data)-1):
                        i += 1
                        #If the token equals the first word of the missing name,
                        #try to match the following tokens against the remaining
                        #words. The nested bare excepts catch IndexError when
                        #the name has fewer than 3 words (or the stream ends),
                        #merging however many tokens did match.
                        try:
                            if (data[i].lower().capitalize() == z[0]):
                                try:
                                    if data[i+1].lower().capitalize()==z[1]:
                                        try:
                                            if data[i+2].lower().capitalize()==z[2]:
                                                #3-word name: merge three tokens into one
                                                data[i] = correct
                                                data.pop(i+1)
                                                data.pop(i+1)
                                                found ="Y"
                                                break
                                        except:
                                            #2-word name: merge two tokens into one
                                            data[i] = correct
                                            data.pop(i+1)
                                            found ="Y"
                                            break
                                except:
                                    #Only one word to match (or stream ended)
                                    data[i] = correct
                                    found ="Y"
                                    break
                        except:
                            pass
                    #If verbose equals true then report names that were never matched
                    if verbose == True:
                        if found == "N":
                            # NOTE(review): print() returns None, so `error` is
                            # always None here; the assignment is vestigial.
                            error = print(f"Nothing Found for {Missing[M]}")
                            found= "Y"
                continu = "N"
        self.rawdata = data

    def append_acres(self):
        """
        Walk the token stream and append each numeric value to the
        name_dict list of the row name that precedes it.
        """
        #define your variables
        name_dict = self.name_dict
        data = self.rawdata
        verbose = self.verbose
        # Sentinel meaning "no page marker seen yet"; shrunk when one is found
        stop = 10000
        #Loop through data
        i = -1
        while i < (len(data)-1):
            i += 1
            #If the data entry is a name in the dictionary loop through the
            #following values till you reach another name in the dictionary
            if (data[i] in name_dict):
                name = data[i]
                #verbose true print name
                if verbose ==True:
                    print(name)
                #Only append values that are not one of the names in the dictionary
                pleasestop =data[i+1]
                a = -1
                Dont_Append = False
                while (pleasestop not in name_dict):
                    i+=1
                    a+=1
                    #Page marker "-": remember its offset so later tokens are skipped
                    #NOTE(review): Dont_Append is set but never read.
                    if pleasestop =="-":
                        Dont_Append = True
                        stop = a
                    if a>=stop:
                        #Past the page marker: advance without appending
                        try:
                            data[i+1]
                        except:
                            break
                        pleasestop= data[i+1]
                    else:
                        #Check that value can be converted to a number
                        #(there could be words at the end of the row)
                        try:
                            float(''.join(data[i].split(',')))
                        except:
                            break
                        #Just a measure to make sure data is correctly formatted
                        if verbose==True:
                            print(data[i])
                        if name =="STATE TOTAL":
                            if data[i]=="1":
                                break
                        #Append the value (commas stripped) to the dictionary
                        #list under the corresponding key
                        name_dict[name].append(float(''.join(data[i].split(','))))
                        #Just in case i exceeds the condition of the while loop
                        try:
                            data[i+1]
                        except:
                            break
                        pleasestop= data[i+1]
        self.name_dict=name_dict

    def column_append(self):
        """
        Scan the token stream for the header years and append the column
        labels to self.columns. On the second page (when columns is already
        partly filled) also append the final "Bearing"/"Non-bearing"/"Total"
        labels.
        """
        #Defining variables
        c_list = self.columns
        year=self.year
        data = self.rawdata
        app = False
        #Number of columns still to append (it has to change depending on if
        #1st or 2nd page); derived from the value-list length of the last
        #name_dict entry
        if len(c_list)==0:
            k = len(self.name_dict[list(self.name_dict.keys())[len(self.name_dict.keys())-1]])
        else:
            k = len(self.name_dict[list(self.name_dict.keys())[len(self.name_dict.keys())-1]])-len(c_list)-3
            app = True
        #Loop through data till you find the first header year
        #(year-13 on one page layout, year-29 on the other — TODO confirm)
        j =-1
        while j < (len(data)-1):
            j += 1
            if (data[j] ==str(year-13)) | (data[j] ==str(year-29)):
                #Loop through data until all k columns are appended
                while k != 0:
                    #We dont want blanks
                    #NOTE(review): this `|` condition is always True (a token
                    #cannot equal both strings); `&` was probably intended.
                    if (data[j] != 'County') | (data[j] !="Non-"):
                        #The oldest column is "[Oldest Year] & Earlier".
                        #Since we dont want "& Earlier" to subtract from k
                        #and individually be added, we create a string
                        #combining them all together.
                        if (data[j]=="&"):
                            new_item = f"{c_list[0]} {data[j]} Earlier"
                            c_list.append(new_item)
                            c_list.pop(0)
                        #Append entry to columns
                        else:
                            c_list.append(data[j])
                            k-=1
                    j+=1
        #if second page, then add these final three labels
        if app ==True:
            c_list.append("Bearing")
            c_list.append("Non-bearing")
            c_list.append("Total")
        self.columns = c_list
#Convert each yearly acreage-report PDF (1996 through 2021) into a per-year CSV.
for report_year in range(1996, 2022):
    #Fresh accumulator for this year: county name -> list of acreage values
    county = {"Amador": [], 'Butte': [ ], "Calaveras": [ ], 'Colusa': [ ], 'Contra Costa': [ ],
              'Fresno': [ ], 'Glenn': [ ], 'Kern': [ ], 'Kings': [ ], 'Lake': [ ],
              'Los Angeles': [ ], 'Madera': [ ], 'Merced': [ ], 'Monterey': [ ], "Placer": [ ], 'Riverside': [ ],
              'Sacramento': [ ], 'San Benito': [ ], 'San Bernardino': [ ], 'San Joaquin': [ ],
              'Shasta': [ ],'San Luis Obispo': [ ], "Santa Clara": [ ], "Napa": [ ], "El Dorado": [ ], "Santa Barbara": [ ],
              "Sonoma": [ ], 'Solano': [ ], 'Stanislaus': [ ], 'Sutter': [ ], 'Tehama': [ ],
              'Tulare': [ ], 'Yolo': [ ], 'Yuba': [ ], 'All Others': [ ], 'State Total': [ ]}
    #Column labels accumulate here across both pages
    columns = []
    #Input/output paths derived from the year
    pdf_path = os.path.join("..", "Resources", f"{report_year}.pdf")
    output = os.path.join("..", "Output", f"county_{report_year}.csv")
    #Each report spans two PDF pages; both feed the same accumulators.
    #ALMOND_PDF is defined above (see functions.py for details).
    for page in range(1, 3):
        report = ALMOND_PDF(report_year, name_dict=county, n=page, columns=columns, pdf_path=pdf_path)
        report.clean_data()
        report.row_check()
        report.append_acres()
        report.column_append()
    #Not every county appears in every report: pad absent counties with NaN rows
    nan_row = [None] * 33
    for name in county:
        if not county[name]:
            county[name] = nan_row
    #Build the DataFrame (counties as the index) and export it
    df = pd.DataFrame.from_dict(county, orient='index')
    df.columns = columns
    df.to_csv(output)
#Combine the per-year county CSVs into one county-level CSV and one
#state-total CSV. Columns are relabeled by tree age rather than by
#planting year so that every year's data lines up.
new_columns = ["Over 30", "29", "28", "27", "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16",
               "15", "14", "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "Bearing", "Non-Bearing", "Totals"]
#File Path for 2021 — the newest year seeds the combined frames
infile = "county_2021.csv"
csv_path = os.path.join("..", "Output", infile)
#Open 2021 county data and create the initial dataframe to append to
df = pd.read_csv(csv_path, index_col=0)
df.columns = new_columns
#Tag rows with the report year and move the county index into a column
df["Year"] = 2021
df.reset_index(inplace=True)
df = df.rename(columns = {'index':'County'})
#Split off the state-total row (the last row, index 35) from the counties
df_totals = pd.DataFrame()
df_totals[2021] = df.iloc[35]
df = df.drop(df.index[35])
#One-hot encode the county names for the combined county frame
df1 = pd.get_dummies(df, columns=["County"])
df_totals = df_totals.T
#Loop through the remaining years (2020 down to 1996) and stack them on.
#(The original `while year != 1995` with a manual decrement is clearer as
#a descending range.)
for year in range(2020, 1995, -1):
    #Open files
    infile = f"county_{year}.csv"
    csv_path = os.path.join("..", "Output", infile)
    df = pd.read_csv(csv_path, index_col=0)
    #clean, same as above
    df.columns = new_columns
    df["Year"] = year
    df.reset_index(inplace=True)
    df = df.rename(columns = {'index':'County'})
    #Separate the state-total row (always the last row)
    df_totals2 = pd.DataFrame()
    df_totals2[year] = df.iloc[len(df)-1]
    df_totals2 = df_totals2.T
    df = df.drop(df.index[len(df)-1])
    df = pd.get_dummies(df, columns=["County"])
    #Append dataframes. DataFrame.append was deprecated in pandas 1.4 and
    #removed in 2.0; pd.concat is the supported equivalent.
    df1 = pd.concat([df1, df], ignore_index=True)
    df_totals = pd.concat([df_totals2, df_totals], ignore_index=True)
#final cleaning: the totals frame carries a stray County column — drop it
df_totals = df_totals.drop(columns="County")
#Output and exporting
outfile1 = "Almond_Acreage_County.csv"
outfile2 = "Almond_Acreage_State.csv"
output1 = os.path.join("..", "Output", outfile1)
output2 = os.path.join("..", "Output", outfile2)
df1.to_csv(output1)
df_totals.to_csv(output2)
#head is a method: the original `df1.head` (no parentheses) only displayed
#the bound method object instead of the first rows
df1.head()
Over 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | ... | County_Santa Clara | County_Shasta | County_Solano | County_Sonoma | County_Stanislaus | County_Sutter | County_Tehama | County_Tulare | County_Yolo | County_Yuba | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 4646.0 | 396.0 | 627.0 | 1205.0 | 1469.0 | 1045.0 | 1312.0 | 795.0 | 1285.0 | 866.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1746.0 | 413.0 | 893.0 | 844.0 | 603.0 | 1161.0 | 1658.0 | 1983.0 | 1271.0 | 1790.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
905 | 135.0 | 7.0 | 0.0 | 13.0 | 9.0 | 109.0 | 248.0 | 17.0 | 206.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
906 | 69.0 | 89.0 | 63.0 | 14.0 | 0.0 | 178.0 | 40.0 | 559.0 | 219.0 | 102.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
907 | 1716.0 | 28.0 | 26.0 | 157.0 | 24.0 | 46.0 | 32.0 | 52.0 | 18.0 | 33.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
908 | 379.0 | 0.0 | 3.0 | 62.0 | 1.0 | 8.0 | 69.0 | 62.0 | 170.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
909 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
910 rows × 69 columns
df_totals.head()
Over 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | ... | 6 | 5 | 4 | 3 | 2 | 1 | Bearing | Non-Bearing | Totals | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 34319.0 | 4834.0 | 4541.0 | 9239.0 | 9009.0 | 10187.0 | 13578.0 | 19786.0 | 14861.0 | 6632.0 | ... | 11592.0 | 12000.0 | 16862.0 | 23017.0 | 25761.0 | 17307.0 | 400692.0 | 66085.0 | 466777.0 | 1996 |
1 | 36200.3 | 3653.3 | 7936.6 | 7427.9 | 9392.2 | 12345.6 | 18448.8 | 13723.9 | 6486.7 | 4919.5 | ... | 11459.1 | 17770.9 | 22822.7 | 27781.9 | 21827.9 | 13348.7 | 409094.4 | 62958.5 | 472052.9 | 1997 |
2 | 33111.0 | 6295.0 | 6499.0 | 6940.0 | 10214.0 | 15987.0 | 11742.0 | 5799.0 | 4067.0 | 12119.0 | ... | 18637.0 | 25322.0 | 30777.0 | 27481.0 | 24424.0 | 28727.0 | 420483.0 | 80632.0 | 501115.0 | 1998 |
3 | 33505.0 | 5599.0 | 5293.0 | 8651.0 | 14245.0 | 11255.0 | 5482.0 | 4097.0 | 11615.0 | 21160.0 | ... | 31103.0 | 35845.0 | 33233.0 | 30989.0 | 37048.0 | 22616.0 | 457868.0 | 90654.0 | 548522.0 | 1999 |
4 | 31000.0 | 3744.0 | 7452.0 | 12409.0 | 10291.0 | 4818.0 | 3651.0 | 10788.0 | 19909.0 | 27341.0 | ... | 36714.0 | 34628.0 | 33365.0 | 40691.0 | 26369.0 | 13478.0 | 472463.0 | 80537.0 | 553000.0 | 2000 |
5 rows × 34 columns