#==============================================================================#
#                                GrainSizeTools                                #
#       A Python script to estimate a representative value of grain size       #
#              for paleopiezometry and paleowattometry studies                 #
#                                                                              #
#                 Copyright (c) 2014   Marco A. Lopez-Sanchez                  #
#                                                                              #
#    GrainSizeTools is free software: you can redistribute it and/or modify    #
#    it under the terms of the GNU General Public License as published by      #
#    the Free Software Foundation, either version 3 of the License, or         #
#    (at your option) any later version.                                       #
#                                                                              #
#    GrainSizeTools is distributed in the hope that it will be useful,         #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              #
#    GNU General Public License for more details.                              #
#                                                                              #
#    You should have received a copy of the GNU General Public License         #
#    along with GrainSizeTools. If not, see <http://www.gnu.org/licenses/>.    #
#                                                                              #
#                                                                              #
#    Version 0.2                                                               #
#    For details see: https://sourceforge.net/projects/grainsizetools/         #
#    or http://sourceforge.net/p/grainsizetools/wiki/Home/                     #
#                                                                              #
#    Requirements:                                                             #
#        Python version 2.7.x or 3.4.x                                         #
#        Numpy version 1.5 or higher                                           #
#        Matplotlib version 1.4.2 or higher                                    #
#        Scipy version 0.13 or higher                                          #
#==============================================================================#

from __future__ import division, print_function # this is to avoid python 2.x - 3.x compatibility issues
import numpy as np
from numpy import mean, std, median, pi, sqrt
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import matplotlib as mpl
# Global Matplotlib configuration, applied once when the script is loaded.
# Every figure created afterwards uses the 'ggplot' style sheet.
mpl.style.use('ggplot')
#mpl.rcParams['font.family'] = 'Arial' # uncomment this to set a user-defined font in plots
# font size of the tick labels on the x and y axes
mpl.rcParams['xtick.labelsize'] = 12.
mpl.rcParams['ytick.labelsize'] = 12.

def importdata(filePath):
    """ Load the data from a text file (e.g. txt or csv) and return a numpy array
    with the values of the dataset. The file is read with np.genfromtxt using its
    default settings. To avoid problems use forward slash or double backslash
    in the filePath (e.g. "C:/yourfilelocation.../nameofthefile.txt") instead of
    single backslash.

    PARAMETER
    filePath: the file location in the OS in quotes

    RETURNS
    The numpy array produced by np.genfromtxt.

    RAISES
    IOError: if the file cannot be opened. A hint about the most common cause
             (single backslashes in the path) is printed before the error is
             re-raised.
    """
    try:
        dataSet = np.genfromtxt(filePath)

    except IOError:
        print (' ')
        print ('importfile could not open:', filePath)
        print (' ')
        print ('MAKE SURE you have NOT used single backslash to enter the file location')
        # Propagate the original error: there is no dataset to return, and
        # falling through would fail with an unrelated UnboundLocalError.
        raise

    return dataSet


def calc_diameters(areas, addPerimeter = 0):
    """ Convert sectional areas into equivalent circular diameters, i.e. the
    diameter of the circle that has the same area as each grain section
    (this assumes near-equant grain shapes).

    PARAMETERS
    areas: a numpy array with the sectional areas of the grains
    addPerimeter: a float or integer added to every diameter to correct for
            the perimeter of the grain. Defaults to 0 (no correction).

    RETURNS
    A numpy array with the (optionally corrected) diameters.
    """

    # equivalent circular diameter: d = 2 * sqrt(area / pi)
    equivalent = sqrt(areas/pi)*2

    # nothing to correct: hand back the plain equivalent diameters
    if addPerimeter == 0:
        return equivalent

    return equivalent + addPerimeter


def find_grain_size(areas, diameters, binsize='FD'):
    """Estimate a representative numeric value of grain size from the
    population of apparent diameters and areas. The results are printed and
    shown in a figure with the number- and area-weighted distributions.

    PARAMETERS
    areas: a numpy array with the sectional areas of the grains
    diameters: a numpy array with the diameters of the grains
    binsize: the method used to calculate the bin size. This can be 'FD'
             (Freedman-Diaconis rule), 'Scott' (Scott rule) or a positive scalar
             constant of type integer or float (numpy scalars are accepted too).
             If not specified, 'FD' is used by default.

    RETURNS
    None. If binsize is not valid, or the resulting bin size is not greater
    than zero, a message is printed and nothing is computed.
    """

    # determine the bin size according to the method chosen
    if binsize == 'FD':
        h = get_FD_binsize(diameters)
    elif binsize == 'Scott':
        h = get_Scott_binsize(diameters)
    elif isinstance(binsize, (int, float, np.integer, np.floating)) and not isinstance(binsize, bool):
        # bool is a subclass of int but is never a meaningful bin size
        h = binsize
    else:
        print (binsize, " is not an integer or float nor 'FD' or 'Scott'")
        return None

    # A zero or negative bin size (given by the user, or produced when an
    # automatic rule rounds down to 0.0) cannot define histogram intervals.
    if not h > 0:
        print ('The bin size must be greater than zero, got', h)
        return None

    # determine the grain size parameters using the number-weighted approach
    binList, xgrid, y_values, y_max, x_peak, mean_GS, median_GS = calc_numberweighted(diameters, h)

    # determine the grain size using the area-weighted approach
    cumulativeAreas, intValues, weightedMean = calc_areaweighted(areas, diameters, h)

    # generate the number- and area-weighted plots
    generate_plot(diameters, binList, xgrid, y_values, y_max, x_peak, mean_GS, median_GS, intValues, cumulativeAreas, h, weightedMean)

    return None
    

#==================================================================================================#
# Functions used by the find_grain_size function to obtain all the parameters needed to estimate the
# grain size and to generate the plots. The names of the functions are self-explanatory.
#==================================================================================================#

def calc_numberweighted(diameters, binsize):
    """ Number-weighted analysis of the grain diameter population.

    Builds the histogram of the diameters and a Gaussian kernel density
    estimate (kde) of the same data. Prints the mean, the median, the modal
    interval of the histogram with its middle value, the location of the kde
    peak and the kde bandwidth.

    PARAMETERS
    diameters: A numpy array with the diameters of the grains
    binsize: an integer or float.

    RETURNS
    A tuple (binList, xgrid, y_values, y_max, x_peak, mean_GS, median_GS) with
    the histogram bin edges, the x and y values of the kde curve, the height
    and location of the kde peak, and the mean and median diameters. These
    values are used later to make the number weighted plot.
    """

    mean_GS = mean(diameters)
    median_GS = median(diameters)
    smallest = diameters.min()
    largest = diameters.max()

    # bin edges start at zero and extend past the largest diameter
    binList = np.arange(0., largest+binsize, binsize)
    counts, _edges = np.histogram(diameters, bins=binList)

    # the modal interval is the bin holding the most grains; argmax gives the
    # position of its left edge
    left_edge = binList[np.argmax(counts)]
    right_edge = left_edge + binsize

    print(' ')
    print('NUMBER WEIGHTED APPROACH:')
    print(' ')
    print('Mean grain size =', round(mean_GS, 2), 'microns')
    print('Median grain size =', round(median_GS, 2), 'microns')
    print(' ')
    print('The modal interval is', left_edge, '-', right_edge)
    print('Middle value =', round((left_edge+right_edge)/2., 1), 'microns')

    # Gaussian kernel density estimate; the bandwidth follows the Silverman
    # rule (Silverman 1986)
    kde = gaussian_kde(diameters, bw_method=my_kde_bandwidth)

    # evaluate the kde on a regular grid, using a finer grid for wide ranges
    n_points = 2**12 if (largest - smallest) < 400 else 2**14
    xgrid = np.linspace(smallest, largest, n_points)
    y_values = kde(xgrid)

    # locate the peak of the kde curve
    y_max = y_values.max()
    x_peak = xgrid[np.argmax(y_values)]

    print(' ')
    print('Gaussian KDE peak = ', round(x_peak, 2), 'microns')
    print('Bandwidth =', round(kde.covariance_factor()*diameters.std(ddof=1), 2), '(Silverman rule)')

    return binList, xgrid, y_values, y_max, x_peak, mean_GS, median_GS

    
def get_FD_binsize(dataSet):
    """ Bin size for a histogram of the dataset according to the
    Freedman-Diaconis rule (Freedman and Diaconis 1981):

        h = 2*IQR/(n^(1/3))

    where IQR is the interquartile range and n the number of values. The
    result is printed and returned rounded to one decimal.

    Reference:
    Freedman D & Diaconis (1981) Zeitschrift fur Wahrscheinlichkeitstheorie und verwandte Gebiete 57:453-476

    PARAMETER
    dataSet: A numpy array with the dataset"""

    # interquartile range: distance between the 75th and 25th percentiles
    q3 = np.percentile(dataSet, 75)
    q1 = np.percentile(dataSet, 25)
    iqr = q3 - q1

    cube_root_n = len(dataSet)**(1./3)
    rounded = round(2*iqr/cube_root_n, 1)

    print(' ')
    print("The bin size according to the Freedman-Diaconis rule is", rounded)

    return rounded
    
def get_Scott_binsize(dataSet):
    """ Bin size for a histogram of the dataset according to the Scott rule
    (Scott 1979):

        h = 3.49*stdDev/(n^(1/3))

    where stdDev is the standard deviation and n the number of values. The
    Scott rule is optimal for random samples of normally distributed data.
    The result is printed and returned rounded to one decimal.

    Reference:
    Scott DW (1979) Biometrika 66:605-610

    PARAMETER
    dataSet: A numpy array with the dataset"""

    spread = std(dataSet)
    cube_root_n = len(dataSet)**(1./3)
    rounded = round(3.49*spread/cube_root_n, 1)

    print(' ')
    print("The bin size according to the Scott rule is", rounded)

    return rounded
    

def my_kde_bandwidth(obj, fac=1.):
    """Bandwidth factor of the Gaussian kde following the Silverman rule,
    optionally scaled by a constant factor.

    obj: the kde object; its attributes n and d are read
    fac: the constant factor, 1. as default
    """

    # Silverman rule: (n*(d+2)/4)^(-1/(d+4))
    base = obj.n*(obj.d+2.0)/4.0
    exponent = -1./(obj.d+4)

    return np.power(base, exponent)*fac


def calc_areaweighted(areas, diameters, binsize):
    """ Calculates the area percentage of equivalent radii of the dataset
    based on Herwegh (2000) and Berger et al. (2011) approach. Prints the
    grain size interval with the maximum area accumulated, the middle
    value of this interval and the area weighted arithmetic mean.

    The diameters are grouped in consecutive intervals of width binsize that
    start at zero; each interval excludes its left edge and includes its
    right edge. Every grain contributes its own area, including grains that
    share the same diameter. The input arrays are not modified.

    References:
    Herwegh (2000) Journal of Structural Geology 22:391-400
    Berger et al. (2011) Journal of Structural Geology 33:1751-1763

    PARAMETERS
    areas: A numpy array with the sectional areas of the grains
    diameters: A numpy array with the estimated diameters of the grains,
               in the same order as areas
    binsize: the bin size, an integer or float greater than zero

    RETURNS
    A tuple (cumulativeAreas, intValues, weightedMean):
    cumulativeAreas: a list with the area accumulated in each interval
                     (rounded to one decimal) plus a trailing 0
    intValues: a list with the left edges of the intervals (rounded to two
               decimals) plus the right edge of the last one; it has the
               same length as cumulativeAreas
    weightedMean: the area weighted arithmetic mean of the diameters
    These values are used later to make the area weighted plot.

    RAISES
    ValueError: if binsize is not greater than zero, if areas and diameters
                differ in shape, or if the arrays are empty
    """

    if not binsize > 0:
        # a non-positive step would never reach the largest diameter
        raise ValueError('binsize must be greater than zero')

    # float views/copies of the inputs; nothing below modifies them in place
    areas = np.asarray(areas, dtype=float)
    diameters = np.asarray(diameters, dtype=float)
    if areas.shape != diameters.shape:
        raise ValueError('areas and diameters must have the same shape')

    # calculate the area weighted arithmetic mean
    weightedMean = np.sum(diameters * (areas / areas.sum()))

    # interval edges: 0, binsize, 2*binsize, ... up to the first edge that
    # reaches the largest diameter
    maxVal = diameters.max()
    edges = [0]
    while edges[-1] < maxVal:
        edges.append(edges[-1] + binsize)
    nbins = len(edges) - 1

    # np.digitize(..., right=True) returns i such that edges[i-1] < d <= edges[i],
    # so i-1 is the interval index; diameters not above zero get -1 and are left out
    binIndex = np.digitize(diameters, edges, right=True) - 1
    inside = binIndex >= 0
    areaPerBin = np.bincount(binIndex[inside], weights=areas[inside], minlength=nbins)

    cumulativeAreas = [round(float(total), 1) for total in areaPerBin]
    cumulativeAreas.append(0) # add one element at the end of the list
    intValues = [edges[0]] + [round(edge, 2) for edge in edges[1:]]

    # index of the interval with the largest accumulated area (the modal interval)
    getIndex = cumulativeAreas.index(max(cumulativeAreas))
    print (' ')
    print ('AREA WEIGHTED APPROACH:')
    print (' ')
    print ('Area-weighted mean grain size =', round(weightedMean, 2), 'microns')
    print (' ')
    print ('The modal interval is', intValues[getIndex], '-', (intValues[getIndex]+binsize), 'microns')
    print ('Middle value =', round((intValues[getIndex]+(intValues[getIndex]+binsize))/2.0, 1), 'microns')
    print (' ')

    return cumulativeAreas, intValues, weightedMean


def generate_plot(diameters, binList, xgrid, y_values, y_max, x_peak, mean_GS, median_GS, intValues, cumulativeAreas, h, weightedMean):
    """ Generate and show a figure containing two subplots: the number
    weighted and the area weighted distributions.

    PARAMETERS
    diameters: a numpy array with the diameters of the grains
    binList: the bin edges of the number weighted histogram
    xgrid, y_values: the x and y values of the Gaussian kde curve
    y_max, x_peak: the height and location of the Gaussian kde peak
    mean_GS, median_GS: the mean and median diameters
    intValues: the left edges of the area weighted intervals
    cumulativeAreas: the area accumulated in each area weighted interval
    h: the bin size (width of the area weighted bars)
    weightedMean: the area weighted mean diameter

    RETURNS
    The value returned by plt.show().
    """

    plt.figure(figsize=(15,5))

    # number weighted subplot
    plt.subplot(121)

    # Normalised histogram (total bar area equal to one). It is computed here
    # and drawn with plt.bar because the 'normed' option of plt.hist is not
    # available in recent Matplotlib versions.
    counts, edges = np.histogram(diameters, bins=binList)
    widths = np.diff(edges)
    density = counts / (float(counts.sum()) * widths)
    # align='edge' is explicit: the x values are the left edges of the bars
    plt.bar(edges[:-1], density, width=widths, align='edge', color='#66C2A5', edgecolor='#EBF7F3') # histogram
    plt.plot([mean_GS, mean_GS], [0.0001, y_max], linestyle='-', color='#e7298a', label='mean grain size', linewidth=2)
    plt.plot([median_GS, median_GS], [0.0001, y_max], linestyle='-', color='#7570b3', label='median grain size', linewidth=2)
    plt.plot(xgrid, y_values, color='#252525', label='Gaussian KDE', linewidth=2) # Gaussian kde
    plt.ylabel('frequency', fontsize= 15)
    # raw string: the backslash belongs to the mathtext command, not to Python
    plt.xlabel(r'diameter ($\mu m$)', fontsize= 15)
    plt.title('Number weighted distribution', fontsize=16)
    plt.plot([x_peak], [y_max], 'o', color='#252525')
    plt.vlines(x_peak, 0.0001, y_max, linestyle='--', color='#252525', linewidth=2)
    # the text is placed at an offset (in points) from the peak so that it
    # stays next to it whatever the range of the data
    plt.annotate('Gaussian KDE peak', xy=(x_peak, y_max), xytext=(+10, +30), textcoords='offset points', label='peak')
    plt.legend(loc='upper right', fontsize=13)

    # area weighted subplot
    plt.subplot(122)

    # normalize the y-axis values to percentage of the total area
    totalArea = sum(cumulativeAreas)
    cumulativeAreasNorm = [(x/float(totalArea))*100 for x in cumulativeAreas]
    maxValue = max(cumulativeAreasNorm)

    # intValues holds the left edge of each interval, hence align='edge'
    plt.bar(intValues, cumulativeAreasNorm, width=h, align='edge', color='#66C2A5', edgecolor='#EBF7F3') # bar plot
    plt.plot([weightedMean, weightedMean], [0.0001, maxValue], linestyle='--', color='#252525', label='area weighted mean', linewidth=2)
    plt.ylabel('% of area fraction within the interval', fontsize=15)
    plt.xlabel(r'diameter ($\mu m$)', fontsize=15)
    plt.title('Area weighted distribution', fontsize=16)
    plt.legend(loc='upper right', fontsize=13)

    return plt.show()



# welcome message printed once when the script is loaded
print('\n'.join([' ',
                 'Welcome to the GrainSizeTools script v. 0.2',
                 'see release notes in the Readme.txt file',
                 ' ']))
