This notebook extracts data from yearly Almond Acreage Report PDFs found on the website of the Almond Board of California. Over the summer of 2021 I worked as an Assistant Economic Research intern at AgIS Capital, where I manually had to copy and paste this data into Excel in order to do any sort of analysis. And then I said never again. As one might imagine, this was very monotonous work, and I started to wonder if I could write Python code that might make the process more efficient. While I have partly failed to accomplish this — 2013, and the years 2007 and earlier, wound up being impossible to get data out of — I was able to create CSVs for 2021 through 2008. Furthermore, as new reports are released, I will be able to use my class ALMOND_PDF to create a new CSV file.
The order of this document is:
Converting PDFs to CSVs
Combining these CSVs into one CSV for all years
#import modules
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import os
import PyPDF2
from itertools import chain
import pdfplumber
from functions import ALMOND_PDF
#Class used to convert acreage-report PDFs from the Almond Board into tabular data
class ALMOND_PDF:
    """
    Using pdfplumber, extract data from one page of an almond acreage report.

    The methods are intended to be called in order:
    clean_data() -> row_check() -> append_acres() -> column_append().
    Results accumulate in `name_dict` (acreage values keyed by row name)
    and `columns` (column labels); both are caller-supplied lists/dicts
    and are mutated in place, so they can be shared across the two pages
    of a report.

    Parameters
    ----------
    year : int
        The year of report you are looking for.
    name_dict : dict
        Maps row names (e.g. counties or varieties) to lists that the
        extracted acreage values are appended to.
    pdf_path : str
        Path of the report PDF to open.
    columns : list
        List that column labels are appended to (shared across pages).
    n : int
        The page number, default is 1.
    verbose : bool
        Should the methods show extra extraction info to debug?
    """
    def __init__(self, year, name_dict, pdf_path, columns, n=1, verbose=False):
        self.name_dict = name_dict
        self.year= year
        self.n = n
        self.verbose=verbose
        self.columns=columns
        # Opening the pdf and reading page n with pdfplumber; rawdata is the
        # page text split into a list of lines.
        with pdfplumber.open(pdf_path) as pdf:
            self.rawdata = pdf.pages[n].extract_text().split('\n')

    def clean_data(self, empty=''):
        """
        Regularize self.rawdata into one flat list of word/number tokens.

        Some of the PDFs yield one string per table row while others yield
        one string per cell; after this call self.rawdata is a single flat
        token list either way.

        NOTE(review): the `empty` parameter is never used in the body.
        """
        rawdata = self.rawdata
        #Creating empty list and looping through data
        l = []
        for i in range(len(rawdata)):
            a = rawdata[i]
            # if the value is a string containing multiple values, split it into a list
            z = a.split(' ')
            try:
                # Probe: does the line start with an integer token?
                int(z[0])
                # Past the first few header lines, drop that leading numeric
                # token (presumably a row-label/year marker — TODO confirm
                # against the actual PDFs).
                if i>3:
                    z.pop(0)
                # appending to list
                l.append(z)
            except:
                # First token is not an integer: keep the split line as-is.
                l.append(z)
        #using itertools' chain function, flatten this list of lists into one list
        rawdata = list(chain(*l))
        self.rawdata = rawdata
        # if verbose is true show data
        # NOTE(review): bare expression — evaluates but never prints anything.
        if self.verbose ==True:
            self.rawdata

    def row_check(self):
        """
        Check the token stream for every row name we are searching for, and
        repair names that were split into multiple tokens (e.g. "San Luis
        Obispo"). If verbose, prints the unmatched tokens and asks (via
        input()) whether to search for the missing names, so this method is
        interactive in verbose mode.
        """
        #define your variables
        name_dict = self.name_dict
        data = self.rawdata
        verbose = self.verbose
        #List to append found row names to
        rowlist = []
        i = -1
        #loop through data to find tokens that are keys of name_dict and append them
        while i < (len(data)-1):
            i += 1
            if (data[i] in name_dict):
                rowlist.append(data[i])
        #Check if there are missing values
        Missing = list(set(name_dict) - set(rowlist))
        if len(Missing) == 0:
            print("No missing vaules!")
        #If there are missing values begin the search (likely just names with multiple words)
        else:
            continu = "Y"
            #prints out the unmatched tokens if verbose, and asks whether to continue
            if verbose==True:
                print("-------------")
                while i < (len(data)-1):
                    i += 1
                    if (data[i] != ' '):
                        if (data[i] not in name_dict):
                            print(data[i])
                print("-------------")
                continu = input("Begin search for missing value Y or N?")
            print(f"These values are missing: {Missing}")
            while continu != "N":
                #loop through missing values
                for M in range(len(Missing)):
                    #Split the missing name into its words
                    correct = Missing[M]
                    z = correct.split(' ')
                    error = z[0]
                    found = "N"
                    #loop through data from the start
                    i = -1
                    while i < (len(data)-1):
                        i += 1
                        #If the token equals the first word of the missing name,
                        #try to match the following tokens against the remaining
                        #words. The nested bare excepts catch IndexError when
                        #the name has fewer than 3 words (or the stream ends),
                        #merging however many tokens did match.
                        try:
                            if (data[i].lower().capitalize() == z[0]):
                                try:
                                    if data[i+1].lower().capitalize()==z[1]:
                                        try:
                                            if data[i+2].lower().capitalize()==z[2]:
                                                #3-word name: merge three tokens into one
                                                data[i] = correct
                                                data.pop(i+1)
                                                data.pop(i+1)
                                                found ="Y"
                                                break
                                        except:
                                            #2-word name: merge two tokens into one
                                            data[i] = correct
                                            data.pop(i+1)
                                            found ="Y"
                                            break
                                except:
                                    #Only one word to match (or stream ended)
                                    data[i] = correct
                                    found ="Y"
                                    break
                        except:
                            pass
                    #If verbose equals true then report names that were never matched
                    if verbose == True:
                        if found == "N":
                            # NOTE(review): print() returns None, so `error` is
                            # always None here; the assignment is vestigial.
                            error = print(f"Nothing Found for {Missing[M]}")
                            found= "Y"
                continu = "N"
        self.rawdata = data

    def append_acres(self):
        """
        Walk the token stream and append each numeric value to the
        name_dict list of the row name that precedes it.
        """
        #define your variables
        name_dict = self.name_dict
        data = self.rawdata
        verbose = self.verbose
        # Sentinel meaning "no page marker seen yet"; shrunk when one is found
        stop = 10000
        #Loop through data
        i = -1
        while i < (len(data)-1):
            i += 1
            #If the data entry is a name in the dictionary loop through the
            #following values till you reach another name in the dictionary
            if (data[i] in name_dict):
                name = data[i]
                #verbose true print name
                if verbose ==True:
                    print(name)
                #Only append values that are not one of the names in the dictionary
                pleasestop =data[i+1]
                a = -1
                Dont_Append = False
                while (pleasestop not in name_dict):
                    i+=1
                    a+=1
                    #Page marker "-": remember its offset so later tokens are skipped
                    #NOTE(review): Dont_Append is set but never read.
                    if pleasestop =="-":
                        Dont_Append = True
                        stop = a
                    if a>=stop:
                        #Past the page marker: advance without appending
                        try:
                            data[i+1]
                        except:
                            break
                        pleasestop= data[i+1]
                    else:
                        #Check that value can be converted to a number
                        #(there could be words at the end of the row)
                        try:
                            float(''.join(data[i].split(',')))
                        except:
                            break
                        #Just a measure to make sure data is correctly formatted
                        if verbose==True:
                            print(data[i])
                        if name =="STATE TOTAL":
                            if data[i]=="1":
                                break
                        #Append the value (commas stripped) to the dictionary
                        #list under the corresponding key
                        name_dict[name].append(float(''.join(data[i].split(','))))
                        #Just in case i exceeds the condition of the while loop
                        try:
                            data[i+1]
                        except:
                            break
                        pleasestop= data[i+1]
        self.name_dict=name_dict

    def column_append(self):
        """
        Scan the token stream for the header years and append the column
        labels to self.columns. On the second page (when columns is already
        partly filled) also append the final "Bearing"/"Non-bearing"/"Total"
        labels.
        """
        #Defining variables
        c_list = self.columns
        year=self.year
        data = self.rawdata
        app = False
        #Number of columns still to append (it has to change depending on if
        #1st or 2nd page); derived from the value-list length of the last
        #name_dict entry
        if len(c_list)==0:
            k = len(self.name_dict[list(self.name_dict.keys())[len(self.name_dict.keys())-1]])
        else:
            k = len(self.name_dict[list(self.name_dict.keys())[len(self.name_dict.keys())-1]])-len(c_list)-3
            app = True
        #Loop through data till you find the first header year
        #(year-13 on one page layout, year-29 on the other — TODO confirm)
        j =-1
        while j < (len(data)-1):
            j += 1
            if (data[j] ==str(year-13)) | (data[j] ==str(year-29)):
                #Loop through data until all k columns are appended
                while k != 0:
                    #We dont want blanks
                    #NOTE(review): this `|` condition is always True (a token
                    #cannot equal both strings); `&` was probably intended.
                    if (data[j] != 'County') | (data[j] !="Non-"):
                        #The oldest column is "[Oldest Year] & Earlier".
                        #Since we dont want "& Earlier" to subtract from k
                        #and individually be added, we create a string
                        #combining them all together.
                        if (data[j]=="&"):
                            new_item = f"{c_list[0]} {data[j]} Earlier"
                            c_list.append(new_item)
                            c_list.pop(0)
                        #Append entry to columns
                        else:
                            c_list.append(data[j])
                            k-=1
                    j+=1
        #if second page, then add these final three labels
        if app ==True:
            c_list.append("Bearing")
            c_list.append("Non-bearing")
            c_list.append("Total")
        self.columns = c_list
#Convert each yearly acreage-report PDF (1996 through 2021) into a per-year CSV.
for report_year in range(1996, 2022):
    #Fresh accumulator for this year: county name -> list of acreage values
    county = {"Amador": [], 'Butte': [ ], "Calaveras": [ ], 'Colusa': [ ], 'Contra Costa': [ ],
              'Fresno': [ ], 'Glenn': [ ], 'Kern': [ ], 'Kings': [ ], 'Lake': [ ],
              'Los Angeles': [ ], 'Madera': [ ], 'Merced': [ ], 'Monterey': [ ], "Placer": [ ], 'Riverside': [ ],
              'Sacramento': [ ], 'San Benito': [ ], 'San Bernardino': [ ], 'San Joaquin': [ ],
              'Shasta': [ ],'San Luis Obispo': [ ], "Santa Clara": [ ], "Napa": [ ], "El Dorado": [ ], "Santa Barbara": [ ],
              "Sonoma": [ ], 'Solano': [ ], 'Stanislaus': [ ], 'Sutter': [ ], 'Tehama': [ ],
              'Tulare': [ ], 'Yolo': [ ], 'Yuba': [ ], 'All Others': [ ], 'State Total': [ ]}
    #Column labels accumulate here across both pages
    columns = []
    #Input/output paths derived from the year
    pdf_path = os.path.join("..", "Resources", f"{report_year}.pdf")
    output = os.path.join("..", "Output", f"county_{report_year}.csv")
    #Each report spans two PDF pages; both feed the same accumulators.
    #ALMOND_PDF is defined above (see functions.py for details).
    for page in range(1, 3):
        report = ALMOND_PDF(report_year, name_dict=county, n=page, columns=columns, pdf_path=pdf_path)
        report.clean_data()
        report.row_check()
        report.append_acres()
        report.column_append()
    #Not every county appears in every report: pad absent counties with NaN rows
    nan_row = [None] * 33
    for name in county:
        if not county[name]:
            county[name] = nan_row
    #Build the DataFrame (counties as the index) and export it
    df = pd.DataFrame.from_dict(county, orient='index')
    df.columns = columns
    df.to_csv(output)
#Combine the per-year county CSVs into one county-level CSV and one
#state-total CSV. Columns are relabeled by tree age rather than by
#planting year so that every year's data lines up.
new_columns = ["Over 30", "29", "28", "27", "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16",
               "15", "14", "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "Bearing", "Non-Bearing", "Totals"]
#File Path for 2021 — the newest year seeds the combined frames
infile = "county_2021.csv"
csv_path = os.path.join("..", "Output", infile)
#Open 2021 county data and create the initial dataframe to append to
df = pd.read_csv(csv_path, index_col=0)
df.columns = new_columns
#Tag rows with the report year and move the county index into a column
df["Year"] = 2021
df.reset_index(inplace=True)
df = df.rename(columns = {'index':'County'})
#Split off the state-total row (the last row, index 35) from the counties
df_totals = pd.DataFrame()
df_totals[2021] = df.iloc[35]
df = df.drop(df.index[35])
#One-hot encode the county names for the combined county frame
df1 = pd.get_dummies(df, columns=["County"])
df_totals = df_totals.T
#Loop through the remaining years (2020 down to 1996) and stack them on.
#(The original `while year != 1995` with a manual decrement is clearer as
#a descending range.)
for year in range(2020, 1995, -1):
    #Open files
    infile = f"county_{year}.csv"
    csv_path = os.path.join("..", "Output", infile)
    df = pd.read_csv(csv_path, index_col=0)
    #clean, same as above
    df.columns = new_columns
    df["Year"] = year
    df.reset_index(inplace=True)
    df = df.rename(columns = {'index':'County'})
    #Separate the state-total row (always the last row)
    df_totals2 = pd.DataFrame()
    df_totals2[year] = df.iloc[len(df)-1]
    df_totals2 = df_totals2.T
    df = df.drop(df.index[len(df)-1])
    df = pd.get_dummies(df, columns=["County"])
    #Append dataframes. DataFrame.append was deprecated in pandas 1.4 and
    #removed in 2.0; pd.concat is the supported equivalent.
    df1 = pd.concat([df1, df], ignore_index=True)
    df_totals = pd.concat([df_totals2, df_totals], ignore_index=True)
#final cleaning: the totals frame carries a stray County column — drop it
df_totals = df_totals.drop(columns="County")
#Output and exporting
outfile1 = "Almond_Acreage_County.csv"
outfile2 = "Almond_Acreage_State.csv"
output1 = os.path.join("..", "Output", outfile1)
output2 = os.path.join("..", "Output", outfile2)
df1.to_csv(output1)
df_totals.to_csv(output2)
#head is a method: the original `df1.head` (no parentheses) only displayed
#the bound method object instead of the first rows
df1.head()
Over 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | ... | County_Santa Clara | County_Shasta | County_Solano | County_Sonoma | County_Stanislaus | County_Sutter | County_Tehama | County_Tulare | County_Yolo | County_Yuba | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 4646.0 | 396.0 | 627.0 | 1205.0 | 1469.0 | 1045.0 | 1312.0 | 795.0 | 1285.0 | 866.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1746.0 | 413.0 | 893.0 | 844.0 | 603.0 | 1161.0 | 1658.0 | 1983.0 | 1271.0 | 1790.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
905 | 135.0 | 7.0 | 0.0 | 13.0 | 9.0 | 109.0 | 248.0 | 17.0 | 206.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
906 | 69.0 | 89.0 | 63.0 | 14.0 | 0.0 | 178.0 | 40.0 | 559.0 | 219.0 | 102.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
907 | 1716.0 | 28.0 | 26.0 | 157.0 | 24.0 | 46.0 | 32.0 | 52.0 | 18.0 | 33.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
908 | 379.0 | 0.0 | 3.0 | 62.0 | 1.0 | 8.0 | 69.0 | 62.0 | 170.0 | 0.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
909 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
910 rows × 69 columns
df_totals.head()
Over 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | ... | 6 | 5 | 4 | 3 | 2 | 1 | Bearing | Non-Bearing | Totals | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 34319.0 | 4834.0 | 4541.0 | 9239.0 | 9009.0 | 10187.0 | 13578.0 | 19786.0 | 14861.0 | 6632.0 | ... | 11592.0 | 12000.0 | 16862.0 | 23017.0 | 25761.0 | 17307.0 | 400692.0 | 66085.0 | 466777.0 | 1996 |
1 | 36200.3 | 3653.3 | 7936.6 | 7427.9 | 9392.2 | 12345.6 | 18448.8 | 13723.9 | 6486.7 | 4919.5 | ... | 11459.1 | 17770.9 | 22822.7 | 27781.9 | 21827.9 | 13348.7 | 409094.4 | 62958.5 | 472052.9 | 1997 |
2 | 33111.0 | 6295.0 | 6499.0 | 6940.0 | 10214.0 | 15987.0 | 11742.0 | 5799.0 | 4067.0 | 12119.0 | ... | 18637.0 | 25322.0 | 30777.0 | 27481.0 | 24424.0 | 28727.0 | 420483.0 | 80632.0 | 501115.0 | 1998 |
3 | 33505.0 | 5599.0 | 5293.0 | 8651.0 | 14245.0 | 11255.0 | 5482.0 | 4097.0 | 11615.0 | 21160.0 | ... | 31103.0 | 35845.0 | 33233.0 | 30989.0 | 37048.0 | 22616.0 | 457868.0 | 90654.0 | 548522.0 | 1999 |
4 | 31000.0 | 3744.0 | 7452.0 | 12409.0 | 10291.0 | 4818.0 | 3651.0 | 10788.0 | 19909.0 | 27341.0 | ... | 36714.0 | 34628.0 | 33365.0 | 40691.0 | 26369.0 | 13478.0 | 472463.0 | 80537.0 | 553000.0 | 2000 |
5 rows × 34 columns