otimdb-eda

G B

#Exploratory Data Analysis of OTIM-DB

This notebook is a companion for the paper: OTIM-DB, a database of near-saturated hydraulic conductivity with management practice.

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import re
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

datadir = '../data/'
outputdir = '../figures/'
letters = 'abcdefghijklmnopqrstuvwxyz'
# read in OTIM database (new structure)
dfdic = pd.read_excel(datadir + 'OTIM-DB.xlsx', sheet_name=None, na_values=[-9999])
dfexp = dfdic['experiments']
dfloc = dfdic['locations']
dfref = dfdic['reference']
dfmeth = dfdic['method'].replace(-9999, np.nan)
dfsp = dfdic['soilProperties']
dfclim = dfdic['climate']
dfmod = dfdic['modelFit']
dfsm = dfdic['soilManagement'].replace(-9999, np.nan).replace(
    'no impact', 'consolidated').replace(  # estetic replacement
    'after tillage', 'soon after tillage')
dfraw = dfdic['rawData']
# merge tables
# NOTE good to merge on specific column (in case same column name appears...)
dfm = dfexp.merge(
    dfloc, how='left', on='Location').merge(
    dfref, how='left', on='ReferenceTag').merge(
    dfmeth, how='left', on='MethodName').merge(
    dfsp, how='left', on='SPName').merge(
    dfclim, how='left', on='ClimateName').merge(
    dfmod, how='left', on='MTFName').merge(
    dfsm, how='left', on='SMName')
dfm = dfm[dfm['ReferenceTag'].ne('kuhwald2017')].reset_index(drop=True)  # remove this entry as only one tension
print('number of entries (rows) initially in the db (as reported in papers):', dfm.shape[0])
number of entries (rows) initially in the db (as reported in papers): 1285

#Defining filters

Entries are kept if:

  • they come from topsoil: UpperD_m < 0.2 m
  • are well fitted: R2 > 0.9
  • measure the full range from 0 mm to 100 mm tension: Tmax >= 100 mm and Tmin == 0 mm
# define entries to keep but doesn't apply the filtering just yet
print('number of topsoil data: {:d} ({:.0f}%)'.format(
    dfm['UpperD_m'].lt(0.2).sum(), dfm['UpperD_m'].lt(0.2).sum()/dfm.shape[0]*100))
#i2keep = dfm['R2'].ge(0.9) & dfm['Tmax'].ge(80) & dfm['Tmin'].le(5) & dfm['UpperD_m'].lt(0.3) & dfm['SoilTextureFAO'].ne('organic')  & dfm['Direction'].eq('Dry to wet') & dfm['Method'].eq('Steady-state piecewise') #& dfm['ReferenceYear'].lt(2013)
i2keep = dfm['R2'].ge(0.9) & dfm['Tmin'].le(5) & dfm['Tmax'].ge(80) & dfm['UpperD_m'].le(0.3) & dfm['SoilTextureFAO'].ne('organic') 
#i2keep = dfm['R2'].ge(0.9) & dfm['Tmin'].le(5) & dfm['Tmax'].ge(80) & dfm['UpperD_m'].le(0.3) & dfm['SoilTextureFAO'].ne('organic') & dfm['MeanDiurnalRange'].ge(13)
i2wet = dfm['Tmax'].lt(100) & dfm['Tmin'].ge(0)
i2dry = dfm['Tmax'].ge(100) & dfm['Tmin'].ge(10)
i2comp = dfm['Tmax'].ge(100) & dfm['Tmin'].le(5)
i2w2d = dfm['Direction'].eq('Wet to dry')
print('number of bad merged rows:', i2keep.sum(), 'out of', i2keep.shape[0])
print('number of entries with short series at wet end:', i2wet.sum()/i2keep.shape[0])
print('number of entries with short series at dry end:', i2dry.sum()/i2keep.shape[0])
print('number of entries with complete series:', i2comp.sum()/i2keep.shape[0])
print('number of wet-to-dry:', i2w2d.sum()/i2keep.shape[0])
print('number of entries interpolated used:', dfm[i2keep]['Tmax'].between(80, 99).sum())
number of topsoil data: 1182 (92%)
number of bad merged rows: 476 out of 1285
number of entries with short series at wet end: 0.3906614785992218
number of entries with short series at dry end: 0.2
number of entries with complete series: 0.40077821011673154
number of wet-to-dry: 0.14007782101167315
number of entries interpolated used: 17
# list of columns grouped per categories
climate_factors_temp = [
    'AnnualMeanTemperature',
    'MeanTemperatureofWarmestQuarter',
    'MeanTemperatureofColdestQuarter', 
    'Isothermality',
    'TemperatureSeasonality',
    'MaxTemperatureofWarmestMonth',
    'MinTemperatureofColdestMonth',
    'TemperatureAnnualRange',
    'MeanTemperatureofWettestQuarter',
    'MeanTemperatureofDriestQuarter',
    'Elevation']

climate_factors_pre = [
    'AnnualPrecipitation',
    'PrecipitationofWettestMonth',
    'PrecipitationofDriestMonth',
    'PrecipitationSeasonality',
    'PrecipitationofWettestQuarter',
    'PrecipitationofDriestQuarter',
    'PrecipitationofWarmestQuarter',
    'PrecipitationofColdestQuarter',
    'MeanDiurnalRange',
    'Elevation',
#     'AverageAridityIndex',
#     'AverageAnnualEvapoTranspiration',
#     'AridityClass'
]

climate_factors = climate_factors_temp + climate_factors_pre

soil_factors = [
 'SoilTextureFAO',
 'SoilTextureUSDA',
 'ClayContent',
 'SiltContent',
 'SandContent',
 'BulkDensity',
 'SoilOrganicCarbon',
#  "SOC_trans"
]

method_factors = [
#  "Month1_sin",
#  "Month1_cos",
#  "Month2_sin",
#  "Month2_cos",
#  "Season_sin",
#  "Season_cos",
 'Method',
 'Direction',
 'Tmin',
 'Tmax',
 'UpperD_m',
 'Diameter'
]

management_factors = [
 'LanduseClass',
 'TillageClass',
 'NbOfCropRotation',
 'CropClass',
 'CoverCropClass',
 'ResidueClass',
 'GrazingClass',
 'IrrigationClass',
 'CompactionClass',
 'SamplingTimeClass',
 'AmendmentClass',
#  'ContainsCereals',
#  'ContainsFallow',
#  'ContainsGrassHerbs',
#  'ContainsLegume',
#  'ContainsMaize',
#  'ContainsTreesFruits',
#  'ContainsVegetables'
]

k_factors = [
 'log_Ks',
 'log_Kunsat',
 'log_k1',
 'log_k2',
 'log_k3',
 'log_k4',
 'log_k5',
 'log_k6',
 'log_k7',
 'log_k8',
 'log_k9',
 'log_k10'
]

factors = climate_factors + soil_factors + method_factors +\
            management_factors + k_factors
# print columns with object dtype to be sure they shouldn't be float
cols = climate_factors + soil_factors + method_factors + management_factors
dtypes = dfm[cols].dtypes
dtypes[dtypes == 'object']
SoilTextureFAO object SoilTextureUSDA object Method object Direction object LanduseClass object TillageClass object CropClass object CoverCropClass object ResidueClass object GrazingClass object IrrigationClass object CompactionClass object SamplingTimeClass object AmendmentClass object dtype: object
# some values are high but it's ok
dfm.sort_values('K1', ascending=False)[['MTFName', 'Ks', 'Kunsat', 'K1', 'slope', 'intercept']].head()
# removing super high values and computing log columns
kcols = ['Ks'] + ['K' + str(i+1) for i in range(10)]

# filtering out unreasonable values
ibad = dfm['K1'] > 2000
print('setting {:d} entries with K too high to NaN'.format(np.sum(ibad)))
dfm.loc[ibad, kcols] = np.nan

# given the wide range of hydraulic conductivity values, we compute the log of K
for kcol in kcols:
    dfm['log_' + kcol] = np.log10(dfm[kcol])
setting 0 entries with K too high to NaN

#Map

# https://stackoverflow.com/questions/32333870/how-can-i-show-a-km-ruler-on-a-cartopy-matplotlib-plot
import os
import cartopy.crs as ccrs
from math import floor
import matplotlib.pyplot as plt
from matplotlib import patheffects
import matplotlib



def utm_from_lon(lon):
    """
    utm_from_lon - UTM zone for a longitude

    Not right for some polar regions (Norway, Svalbard, Antartica)

    :param float lon: longitude
    :return: UTM zone number
    :rtype: int
    """
    return floor( ( lon + 180 ) / 6) + 1

def scale_bar(ax, proj, length, location=(0.52, 0.05), linewidth=2,
              units='km', m_per_unit=1000):
    """

    http://stackoverflow.com/a/35705477/1072212
    ax is the axes to draw the scalebar on.
    proj is the projection the axes are in
    location is center of the scalebar in axis coordinates ie. 0.5 is the middle of the plot
    length is the length of the scalebar in km.
    linewidth is the thickness of the scalebar.
    units is the name of the unit
    m_per_unit is the number of meters in a unit
    """
    # find lat/lon center to find best UTM zone
    x0, x1, y0, y1 = ax.get_extent(proj.as_geodetic())
    # Projection in metres
    utm = ccrs.UTM(utm_from_lon((x0+x1)/2))
    # Get the extent of the plotted area in coordinates in metres
    x0, x1, y0, y1 = ax.get_extent(utm)
    # Turn the specified scalebar location into coordinates in metres
    sbcx, sbcy = x0 + (x1 - x0) * location[0], y0 + (y1 - y0) * location[1]
    # Generate the x coordinate for the ends of the scalebar
    bar_xs = [sbcx - length * m_per_unit/2, sbcx + length * m_per_unit/2]
    # buffer for scalebar
    buffer = [patheffects.withStroke(linewidth=5, foreground="w")]
    # Plot the scalebar with buffer
    ax.plot(bar_xs, [sbcy, sbcy], transform=utm, color='k',
        linewidth=linewidth, path_effects=buffer)
    # buffer for text
    buffer = [patheffects.withStroke(linewidth=3, foreground="w")]
    # Plot the scalebar label
    t0 = ax.text(sbcx, sbcy, str(length) + ' ' + units, transform=utm,
        horizontalalignment='center', verticalalignment='bottom',
        path_effects=buffer, zorder=2)
    left = x0+(x1-x0)*0.05
    # Plot the N arrow
    t1 = ax.text(left, sbcy, u'\u25B2\nN', transform=utm,
        horizontalalignment='center', verticalalignment='bottom',
        path_effects=buffer, zorder=2)
    # Plot the scalebar without buffer, in case covered by text buffer
    ax.plot(bar_xs, [sbcy, sbcy], transform=utm, color='k',
        linewidth=linewidth, zorder=3)

fig = plt.figure(figsize=(10, 6))
#ax = plt.axes(projection=ccrs.InterruptedGoodeHomolosine())
ax = plt.axes(projection=ccrs.PlateCarree())
#plt.title('OTIMD entries')
#ax.set_extent([-180, 180, -80, 80])#, ccrs.Geodetic())
#ax.stock_img()
ax.coastlines()#resolution='10m')

scale_bar(ax, ccrs.PlateCarree(), 1000, location=(-2, 2))  # 100 km scale bar
# or to use m instead of km
# scale_bar(ax, ccrs.Mercator(), 100000, m_per_unit=1, units='m')
# or to use miles instead of km
# scale_bar(ax, ccrs.Mercator(), 60, m_per_unit=1609.34, units='miles')

# not used
xy = dfm[~i2keep][['Latitude', 'Longitude']].drop_duplicates().reset_index(drop=True).values
ax.plot(xy[:,1], xy[:,0], '.', color='tab:orange', transform=ccrs.PlateCarree())

# used
xy = dfm[i2keep][['Latitude', 'Longitude']].drop_duplicates().reset_index(drop=True).values
ax.plot(xy[:,1], xy[:,0], '.', color='tab:blue', transform=ccrs.PlateCarree())

#ax.set_aspect('auto')
matplotlib.rcParams.update({'font.size': 12})
ax.set_ylim([-90, 90])
cax1, = ax.plot([], [], '.', color='tab:blue', label='focus')
cax2, = ax.plot([], [], '.', color='tab:orange', label='extra')
ax.legend(handles=[cax1, cax2])
fig.tight_layout()
fig.savefig(outputdir + 'map.jpg', dpi=500)
plt.show()
<Figure size 720x432 with 1 Axes>

#Tensions availables

# available tension range
dfm2 = dfm.copy()

# interpolation we do when fitting K10
dfm2.loc[dfm2['Tmax'].ge(75), 'Tmax'] = 100

bins = np.arange(0, 100, 10)
def func(x):
    return np.round(x/10, 0)*10
dfm2['Tmin'] = dfm2['Tmin'].apply(func)
dfm2['Tmax'] = dfm2['Tmax'].apply(func)
dfm2.loc[dfm2['Tmax'].gt(100), 'Tmax'] = 100
dfm2['used'] = 'not used'
dfm2.loc[i2keep, 'used'] = 'used'

#dfcount = dfm2.groupby(['used', 'Tmin', 'Tmax']).count().reset_index()
#dfcount = dfcount[dfcount['Method'] > 0].reset_index(drop=True).sort_values('Method')
dfcount = dfm2.groupby(['used', 'Tmin', 'Tmax']).count().reset_index()
# get the order of the Tmin-Tmax range
df = dfcount.groupby(['Tmin', 'Tmax']).sum().sort_values('Method').reset_index()
tmin = df['Tmin'].values
tmax = df['Tmax'].values

# for the histograme per range
#dfcount2 = dfm2.groupby(['used', 'Tmin', 'Tmax']).count().reset_index()
#dfcount = dfcount.sort_values(['used', 'Method'], ascending=[False, False])
# iused = dfcount['used'].eq('used')
# count_used = dfcount[iused]['Method'].values
# y_used = np.arange(dfcount[iused].shape[0])
# inotused = dfcount['used'].eq('used')
# count_notused = dfcount[inotused]['Method'].values
# y_notused = np.arange(dfcount[inotused].shape[0])

# definitions for the axes
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
spacing = 0.005
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom + height + spacing, width, 0.2]
rect_histy = [left + width + spacing, bottom, 0.2, height]

# start with a square Figure showing the range
fig = plt.figure(figsize=(8, 8))
ax = fig.add_axes(rect_scatter)
ax.barh(np.arange(len(tmin)), left=tmin, width=tmax - tmin, color='k')
ax.set_yticks(np.arange(len(tmin)))
ax.set_yticklabels([])
ax.grid(True)
ax.set_xlabel('Tensions [mm]')
ax.set_xlim([-5, 105])
#ax.set_ylim([-0.6, None])
ax.annotate('(b)', (1, 1), fontsize=14)

# histogram of number of entries per tension
ax_histx = fig.add_axes(rect_histx, sharex=ax)
#s1 = dfm[~i2keep][kcols].notnull().sum()#/dfm.shape[0]*100
#s2 = dfm[i2keep][kcols].notnull().sum()
#df = pd.DataFrame([s2, s1]).T.rename(columns={1: 'not used', 0: 'used'})
#df.plot.bar(stacked=True,
#           ylabel='Number of entries', ax=ax_histx)
s = dfm2[i2keep][kcols].notnull().sum()
ax_histx.bar(np.arange(0, s.shape[0])*10, s.values, width=6, label='focus')
s2 = dfm2[~i2keep][kcols].notnull().sum()
ax_histx.bar(np.arange(0, s2.shape[0])*10, s2.values, bottom=s.values,
             width=6, color='tab:orange', label='extra')
ax_histx.legend()
ax_histx.set_ylabel('Number of entries')
ax_histx.tick_params(axis="x", labelbottom=False)
ax_histx.annotate('(a)', (67.5, 1050), fontsize=14)

# histogram of number of entries per tension range
ax_histy = fig.add_axes(rect_histy, sharey=ax)
for i, (a, b) in enumerate(zip(tmin, tmax)):
    c1 = dfcount[dfcount['Tmin'].eq(a) 
                 & dfcount['Tmax'].eq(b) 
                 & dfcount['used'].eq('used')]['Method'].values
    c1 = c1[0] if len(c1) > 0 else 0
    ax_histy.barh(i, c1, color='tab:blue')
    c2 = dfcount[dfcount['Tmin'].eq(a) 
                 & dfcount['Tmax'].eq(b) 
                 & dfcount['used'].eq('not used')]['Method'].values
    c2 = c2[0] if len(c2) > 0 else 0
    ax_histy.barh(i, c2, left=c1, color='tab:orange')
#ax_histy.barh(y, width=count)
ax_histy.set_xlabel('Number of entries')
ax_histy.tick_params(axis="y", labelleft=False)
ax_histy.annotate('(c)', (250, 2), fontsize=14)

fig.savefig(outputdir + 'range-tension.jpg', dpi=500)
<Figure size 576x576 with 3 Axes>
# new entries as in table
cols = ['LanduseClass', 'TillageClass', 'CompactionClass', 'SamplingTimeClass']
inew = dfm['ReferenceYear'].ge(2013) & dfm['ReferenceTag'].apply(lambda x: len(x.split('_')) < 3).astype(bool)
def concat(x):
    return ' | '.join(x.unique())
dico = dict(zip(cols, [concat]*len(cols)))
dico.update({'RefID': 'count'})
df = dfm[inew].groupby('ReferenceTag').agg(dico).reset_index()
df = df.rename(columns={
    'ReferenceTag': 'Reference',
    'RefID': 'Number of entries',
    'LanduseClass': 'Land use',
    'TillageClass': 'Tillage',
    'CompactionClass': 'Compaction',
    'SamplingTimeClass': 'Sampling time'})
def formatref(x):
    return x[:-4].capitalize() + ' et al. ' + x[-4:]
df['Reference'] = df['Reference'].apply(formatref).replace({
    'Deboever et al. 2016': 'De Boever et al. 2016'
})
df.style.hide_index()
# how many mini-disk entries -> not so many
a = (dfm['Diameter'].eq(4.5) | dfm['Diameter'].eq(4.45)).sum()
dff = dfm.groupby('ReferenceTag').first()
b = (dff['Diameter'].eq(4.5) | dff['Diameter'].eq(4.45)).sum()
print('{:d}/{:d} entries in {:d}/{:d} papers'.format(a, len(dfm), b, len(dff)))
ie = dff['ReferenceYear'].gt(2013)
b = (dff[ie]['Diameter'].eq(4.5) | dff[ie]['Diameter'].eq(4.45)).sum()
print('after 2013: {:d}/{:d} entries in {:d}/{:d} papers'.format(a, len(dfm), b, len(dff)))

dff = dfm.groupby('ReferenceTag').first()
print(dff[dff['Diameter'].eq(4.5)].index)

#dfm['Diameter'].value_counts()
148/1285 entries in 10/171 papers
after 2013: 148/1285 entries in 9/171 papers
Index(['Caldwell_2008_GRL', 'Fasinmirin2018', 'Lopes2020', 'Wanniarachchi2019',
       'batkova2020', 'hallam2020', 'lozano2020', 'wang2022', 'zhao2014'],
      dtype='object', name='ReferenceTag')

#Publication year of papers

# number of papers per publication year
dfref = dfdic['reference']
fig, ax = plt.subplots()
pubin = dfm[i2keep].groupby('ReferenceTag').first()['ReferenceYear']
pubout = dfm[~i2keep].groupby('ReferenceTag').first()['ReferenceYear']
pubout = pubout[~pubout.index.isin(pubin.index)]  # ref is "in" if at least one pub is taken after filtering
ax.hist([pubin, pubout], bins=np.arange(1984, 2024, 1), stacked=True)
ax.set_xlabel('Publication year')
ax.set_ylabel('Number of publications')
ax.legend(['focus', 'extra'])
fig.savefig(outputdir + 'publication-years.jpg', dpi=300)
<Figure size 432x288 with 1 Axes>
count, year = np.histogram(dfref[dfref['ReferenceYear'].ge(2000)]['ReferenceYear'], bins=np.arange(2000, 2020, 1))
print('average number of publication since 2000: {:.1f} publications/year'.format(np.average(count)))
average number of publication since 2000: 6.7 publications/year
# how many publications and entries after 2012
ie = dfm['ReferenceYear'].gt(2012)
print('{:d} new entries from {:d} new publications after 2012'.format(
    ie.sum(), dfm[ie].groupby('ReferenceName').first().shape[0]))
565 new entries from 48 new publications after 2012

#Descriptive statistics of features

features = {
    'LanduseClass': ['Land use'],
    'ClayContent': ['Clay content [kg.kg$^{-1}$]', 0, 1],
    'SiltContent': ['Silt content [kg.kg$^{-1}$]', 0, 1],
    'SandContent': ['Sand content [kg.kg$^{-1}$]', 0, 1],
    'TillageClass': ['Tillage'],
    'BulkDensity': ['Bulk density [g.cm$^{-3}$]', 0.5, 2.0],
    'SoilOrganicCarbon': ['Soil organic carbon [kg.kg$^{-1}$]', 0, 0.05],
    'AnnualPrecipitation': ['Annual precipitation [mm]', 0, 2000],
    'CompactionClass': ['Soil compaction'],
    'AnnualMeanTemperature': ['Annual mean temperature [degC]', 0, 30],
    'AverageAridityIndex': ['Average aridity index [-]', 0, 1],
    'PrecipitationSeasonality': ['Precipitation seasonality (CV) [-]', 0, 140],  # coefficient of variation
    'SamplingTimeClass': ['Sampling time'],
    'MeanDiurnalRange': ['Mean diurnal range [degC]', 0, 20],
    #'NbOfCropRotation': ['Number of crop rotation'],
    #'ResidueClass': ['Residues'],
    #'GrazingClass': ['Grazing'],
    #'IrrigationClass': ['Irrigation'],

}

fig, axs = plt.subplots(4, 4, figsize=(12, 10))
axs = axs.flatten()
for i, key in enumerate(features):
    ax = axs[i]
    data = features[key]
    ax.set_title(data[0], fontsize=12)
    if len(data) > 1:
        ax.hist(dfm[i2keep][key], bins=np.linspace(data[1], data[2], 20))
    else:
        dfm[i2keep][key].replace({'unknown': np.nan,
                          -9999: np.nan,
                          0: np.nan} # for the 0 cover crops
                        ).dropna().value_counts().plot.barh(ax=ax)
    ax.annotate('(' + letters[i] + ')', (0.8, 0.85), xycoords='axes fraction')
    if i % 4 == 1:
        ax.set_ylabel('Freq.')
[axs[j].remove() for j in range(i+1, len(axs))]
fig.tight_layout()
fig.savefig(outputdir + 'eda-hist.jpg', dpi=500)
<Figure size 864x720 with 14 Axes>
# table with features statistics
dffeat = pd.DataFrame([
    ['Soil', 'SandContent', 'Sand content', 'kg.kg-1', '-'],
    ['Soil', 'SiltContent', 'Silt content', 'kg.kg-1', '-'],
    ['Soil', 'ClayContent', 'Clay content', 'kg.kg-1', '-'],
    ['Soil', 'BulkDensity', 'Bulk density', 'g.cm-3', '-'],
    ['Soil', 'SoilOrganicCarbon', 'Soil organic carbon', 'kg.kg-1', '-'],
    ['Climate', 'AnnualMeanTemperature', 'Annual mean temperature', '°C', '-'],
    ['Climate', 'AnnualPrecipitation', 'Annual mean precipitation', 'mm', '-'],
    ['Climate', 'AverageAridityIndex', 'Average aridity index', '-', '-'],
    ['Climate', 'PrecipitationSeasonality', 'Precipitation seasonality (CV)', '-', '-'],
    ['Climate', 'MeanDiurnalRange', 'Mean diurnal range', 'degC', '-'],
    ['Management', 'LanduseClass', 'Land use', 'choices', ''],
    ['Management', 'TillageClass', 'Tillage', 'choices', ''],
    #['Management', 'NbOfCropRotation', 'Number of crop rotation', 'choices', ''],
    #['Management', 'ResidueClass', 'Residues', 'choices', ''],
    #['Management', 'GrazingClass', 'Grazing', 'choices', ''],
    #['Management', 'IrrigationClass', 'Irrigation', 'choices', ''],
    ['Management', 'CompactionClass', 'Soil compaction', 'choices', ''],
    ['Management', 'SamplingTimeClass', 'Sampling time', 'choices', '']
], columns=['Type', 'col', 'Predictor', 'Unit', 'Range/Choices'])
dffeat['Number of entries'] = 0
dffeat['Number of gaps'] = 0
for i in range(dffeat.shape[0]):
    col = dffeat.loc[i, 'col']
    s = dfm[col].replace(-9999, np.nan).replace('unknown', np.nan).replace(0, np.nan)
    s2 = dfm[i2keep][col].replace(-9999, np.nan).replace('unknown', np.nan).replace(0, np.nan)
    dffeat.loc[i, 'Number of entries'] = '{:d} ({:d})'.format(s2.notnull().sum(), s.notnull().sum())
    dffeat.loc[i, 'Number of gaps'] = '{:d} ({:d})'.format(s2.isna().sum(), s.isna().sum())
    if dffeat.loc[i, 'Range/Choices'] == '':
        choices = sorted(s.dropna().unique())
        if type(choices[0]) != str:
            choices = ['{:.0f}'.format(a) for a in choices]
        dffeat.loc[i, 'Range/Choices'] = ', '.join(choices)
    elif dffeat.loc[i, 'Range/Choices'] == '-':
        dffeat.loc[i, 'Range/Choices'] = '{:.1f} -> {:.1f} ({:.1f} -> {:.1f})'.format(
            s2.min(), s2.max(), s.min(), s.max())
    # mean and sd?
dffeat = dffeat.drop('col', axis=1)
dffeat.style.hide_index()

#Apply the filtering

The following figures will use the filtered data averaged according to unique combination of land use, tillage, compaction and sampling time per reference. This approach is similar to the one used in the meta-analysis.

# plot the average curve per range
# evolution of K per tension per tillage
gcol = 'Tension range'
dfm2 = dfm.copy()
def func(x):
    return np.round(x/10, 0)*10
dfm2['Tmin'] = dfm2['Tmin'].apply(func)
dfm2['Tmax'] = dfm2['Tmax'].apply(func)
dfm2.loc[dfm2['Tmax'].gt(100), 'Tmax'] = 100
dfm2['Tension range'] = np.nan
ranges = [[0, 100], [0, 60], [10, 100], [10, 60], [10, 40]]
labs = ['[{:2.0f} - {:3.0f}]'.format(a, b) for a, b in ranges]
for i, (a, b) in enumerate(ranges):
    dfm2.loc[dfm2['Tmin'].eq(a) & dfm2['Tmax'].eq(b), 'Tension range'] = labs[i]
dfg = dfm2.groupby(gcol)
dfmean = dfg.mean().reset_index()
dfsem = dfg.sem().reset_index()
dfcount = dfg.count().reset_index()
df = pd.merge(dfmean, dfsem, on=gcol, suffixes=('','_sem'))
df = pd.merge(df, dfcount, on=gcol, suffixes=('', '_count'))
cols = ['log_' + a for a in kcols]
#cols = kcols
semcols = [a + '_sem' for a in cols]
fig, ax = plt.subplots()
for val in labs:
    nref = dfm2.loc[dfm2[gcol] == val, 'ReferenceTag'].unique().shape[0]
    ie = df[gcol] == val
    cax = ax.errorbar(np.arange(0, 11)*10, df[ie][cols].values.flatten(),
                yerr=df[ie][semcols].values.flatten(),
                marker='.', label=val + ' ({:d} refs)'.format(nref))
    ax.fill_between(np.arange(0, 11)*10,
                   df[ie][cols].values.flatten() - df[ie][semcols].values.flatten(),
                   df[ie][cols].values.flatten() + df[ie][semcols].values.flatten(),
                   alpha=0.2, color=cax.get_children()[0].get_color())
ax.legend()
#ax.set_yscale('log')
ax.set_title('Tension range')
ax.set_ylim([0, None])
ax.set_xlabel('Tension [mm]')
ax.set_ylabel('log$_{10}$(K(h)) [mm.h$^{-1}$]')
axh = fig.add_axes([0.18, 0.15, 0.2, 0.2])
for i, c in enumerate(labs):
    sc = dfm2[gcol].eq(c).sum()
    axh.bar(i, sc)
    axh.text(i, sc, '{:d}'.format(sc), ha='center', va='bottom', fontsize=8)
axh.grid(False)
axh.spines['top'].set_visible(False)
axh.spines['right'].set_visible(False)
axh.set_yticks([])
axh.set_xticks([])
axh.set_ylabel('Nb. entries')
axh.patch.set_alpha(0)  # transparent background

fig.savefig(outputdir + 'k-' + gcol + '.jpg', dpi=300)
<Figure size 432x288 with 2 Axes>
# are these differences correlated with other factors such as aridity index or others?
# not weighted correlated
variables = [
    'AnnualMeanTemperature',             
    'MaxTemperatureofWarmestMonth',
    'TemperatureAnnualRange',
    'MeanDiurnalRange',  
    'AnnualPrecipitation',
    'PrecipitationofDriestQuarter',
    'PrecipitationSeasonality',
    'AverageAnnualEvapoTranspiration',
    'AverageAridityIndex',
    'Elevation',
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    groups = [m.group(0) for m in matches]
    gs = []
    for a in groups:
        if a[-2:] == 'of':
            gs.append(a[:-2])
            gs.append('of')
        else:
            gs.append(a)
    return ' '.join(gs).capitalize()

dfcorr = pd.DataFrame(columns=variables)
dfcorr['range'] = labs

for i, val in enumerate(labs):  # for each measurement range
    ie = dfm2[gcol].eq(val)
    # compute correlation to pedo-climatic variables
    for variable in variables:
        x = dfm2[ie]['K3'].values
        # NOTE we choose K3 as it's a common value for all ranges (K10 is only know for some ranges)
        y = dfm2[ie][variable].values
        inan = ~np.isnan(y) & ~np.isnan(x)
        corr, pval = stats.spearmanr(x[inan], y[inan])
        dfcorr.loc[i, variable] = corr

fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(dfcorr[dfcorr.columns[:-1]].T.fillna(0),
            vmin=-1, vmax=1, cmap='bwr', ax=ax,
            annot=True, fmt='1.2f',
            yticklabels=[camel_case_split(a) for a in variables])
ax.set_xticklabels(labs, rotation=90);
<Figure size 360x576 with 2 Axes>
# apply filtering (run ONCE)
dfm = dfm[i2keep].reset_index(drop=True)
# creating weight (max weight per study = 1, weight inside study split according to replicates)
dfm['Reps'] = dfm['Reps'].fillna(1)
dfm['w_eda'] = np.nan
for ref in dfm['ReferenceTag'].unique():
    ie = dfm['ReferenceTag'].eq(ref)
    nbReplicates = dfm[ie]['Reps'].sum()
    dfm.loc[ie, 'w_eda'] = dfm[ie]['Reps']*1/np.sqrt(nbReplicates)

#Correlation

# correlation matrix of K values
xlabs = ['log_' + a for a in kcols]
tensions = ['s','1','2','3','4','5','6','7','8','9','10']
knames = ['$K_{' + a + '0''}$' for a in tensions]
knames[0] = '$K_{s}$'
#ie = pd.Series(factors + xlabs).isin(dfm.columns.tolist())
ie = pd.Series(xlabs).isin(dfm.columns.tolist())
#dfcorr = dfm[np.array(factors + xlabs)[ie]].corr(method='spearman')
dfcorr = dfm[np.array(xlabs)[ie]].corr(method='spearman')
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(dfcorr.values, ax=ax, cmap='RdBu_r', xticklabels=knames,
            yticklabels=knames, vmin=-1, vmax=1,
            cbar_kws={'label': 'Spearman Rank Correlation'})
fig.savefig(outputdir + 'K-correlation.jpg', dpi=500)
<Figure size 288x216 with 2 Axes>
# correlation matrix with all variables
xlabs = ['log_' + a for a in kcols]
ie = pd.Series(factors + xlabs).isin(dfm.columns.tolist())
dfcorr = dfm[np.array(factors + xlabs)[ie]].corr(method='pearson')
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(dfcorr.values, ax=ax, cmap='bwr', xticklabels=dfcorr.columns.values,
            yticklabels=dfcorr.columns.values, vmin=-1, vmax=1,
            cbar_kws={'label': 'Spearman Rank Correlation'})
fig.savefig(outputdir + 'K-correlation.jpg', dpi=500)
<Figure size 1008x720 with 2 Axes>
# useful function for weighted rank correlation
def _weightedRankMean(sdf):
    """Weighted Mean"""
    dfr = sdf[cols].rank().reset_index()
    dfwm = (dfr[cols].mul(sdf['w_eda'], axis=0)).sum()/sdf['w_eda'].sum()
    return dfwm

def _weightedMean(x, w):
    """Weighted Mean"""
    return np.sum(x * w) / np.sum(w)

def _weightedCov(x, y, w):
    """Weighted Covariance"""
    return np.sum(w * (x - _weightedMean(x, w)) * (y - _weightedMean(y, w))) / np.sum(w)

def _weightedCorr(ic, jc, w):
    myWCov = _weightedCov(ic, jc, w)
    myWSd1 = _weightedCov(ic, ic, w)
    myWSd2 = _weightedCov(jc, jc, w)
    myWCorr  = myWCov / np.sqrt(myWSd1 * myWSd2)
    return myWCorr

def _weightedRankCorr(sdf, weight):
    """Weighted Covariance"""

    min_periods = 1
    dfr = sdf.rank()   
    K = sdf.shape[1]
    myWCorr = np.zeros((K, K), dtype=float)
    mask = np.isfinite(dfr)

    w = weight

    for i, ic in enumerate(dfr.columns):
        for j, jc in enumerate(dfr.columns):

            x = dfr[ic]
            y = dfr[jc]

            myWCorr[i, j] = _weightedCorr(x, y, w)
            
    wCorr = pd.DataFrame(data=myWCorr, index=sdf.columns, columns=sdf.columns)

    return wCorr

def _calcConfidence4WeightedRankCorr(dfr, myWeightedCorr, randomizos):
    
    #the confidence interval is calculated based on a randomization test. the latter create a PDF of correlations expected for the data with expected value of 0. See also permutation test on wikipedia    
    K = dfr.shape[1]
    myConfCount = np.zeros((K-1, K-1), dtype=float)   
        
    #calculate rank correlation test    
    for k in range(randomizos):
                  
        #create dataframe 1 and 2 with randomized rows
        dfs1 = shuffle(dfr).reset_index(drop=True)        
        dfs2 = shuffle(dfr).reset_index(drop=True)        
    
        w = (dfs1['w_eda'] + dfs2['w_eda']) / 2
        dfs1 = dfs1.drop(columns = ['w_eda'])
        dfs2 = dfs2.drop(columns = ['w_eda'])
        
        for i, ic in enumerate(dfs1.columns):
            for j, jc in enumerate(dfs2.columns):

                x = dfs1[ic]
                y = dfs2[jc]

                if abs(_weightedCorr(x, y, w)) > abs(myWeightedCorr.iloc[i,j]):
                    myConfCount[i,j] = myConfCount[i,j] + 1        
                    
    pvals = myConfCount / randomizos / 2 #take into account that there are two tails..  

    dfout = pd.DataFrame(data=pvals, index=dfs1.columns, columns=dfs2.columns)

    return dfout

def camel_case_split(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    groups = [m.group(0) for m in matches]
    gs = []
    for a in groups:
        if a[-2:] == 'of':
            gs.append(a[:-2])
            gs.append('of')
        else:
            gs.append(a)
    return ' '.join(gs).capitalize()

def shadepval(dfm, dfcorr, pval=0.05, randomizos=1):
    """Compute weighted pvalue for Spearman rank correlation.
    
    Parameters
    ----------
    dfm2 : pandas.DataFrame
        Initial dataframe with raw data for selected columns.
    dfcorr : pandas.DataFrame
        Correlation matrix corresponding.
    pval : float, optional
        p-value threshold under with the correlationg is considered significant.
    randomizos : int, optional
        Number of randomized realization of dataframe to calculate the signicance level.
        
    Returns
    -------
    dflab : pandas.DataFrame
        Dataframe similar to dfcorr but all dtypes as string. Non-significant results are empty strings.
    dfpval : pandas.DataFrame
        Dataframe with computed p-values.
    """
    # compute pvalue and text to put on cell (if pvalue < threshold)
    #number of randomized realizations of dataframe to calculate the significance level
    dfpval = _calcConfidence4WeightedRankCorr(dfm, dfcorr, randomizos)  #calculate significance  .. take a loooooong time
    corr = dfcorr.copy().values
    corr[dfpval.values > pval] = np.nan  # shading all corr that are below 0.05 pval
    def func(x):
        if pd.isna(x):
            return ''
        else:
            return '{:0.2f}'.format(x)
    dflab = pd.DataFrame(corr, columns=dfpval.columns)
    for col in dflab.columns:
        dflab[col] = dflab[col].apply(func)
    return dflab, dfpval
# just for climatic variables

def showCorr(dfm, xcols, ycols, ax=None, method='spearman',
             weighted=True, pshadow=True, pval=0.05, randomizos=1):
    """Display correlation matrix.
    """
    # make a copy of the dataframe (for safety)
    dfm2 = dfm.copy()
    
    # which label
    if method == 'spearman':
        label = 'Spearman Rank Correlation'
    elif method == 'pearson':
        label = 'Pearson Correlation'
    else:
        label = ''
    
    # define columns names and associated labels
    dic = {
        'AnnualMeanTemperature': 'Mean annual temperature',             
        'MeanTemperatureofWarmestQuarter': 'Mean temperatue of warmest quarter',
        'MaxTemperatureofWarmestMonth': 'Max temperature of warmest month',
        'MeanTemperatureofColdestQuarter': 'Mean temperature of coldest quarter',
        'MinTemperatureofColdestMonth': 'Minimum temperature of coldest month',
        'MeanTemperatureofWettestQuarter': 'Mean temperature of wettest quarter',
        'MeanTemperatureofDriestQuarter': 'Mean temperature of driest quarter',
        'TemperatureSeasonality': 'Temperature seasonality',
        'TemperatureAnnualRange': 'Temperature annaul range',
        'MeanDiurnalRange': 'Mean diurnal temperature',
        'Isothermality': 'Isothermality',
        'AnnualPrecipitation': 'Annual precipitation',
        'PrecipitationofWarmestQuarter': 'Precipitation of warmest quarter',
        'PrecipitationofColdestQuarter': 'Precipitation of coldest quarter',
        'PrecipitationofWettestQuarter': 'Precipitation of wettest quarter',
        'PrecipitationofWettestMonth': 'Precipitation of wettest month',
        'PrecipitationofDriestQuarter': 'Precipitation of driest quarter',
        'PrecipitationofDriestMonth': 'Precipitation of driest month',
        'PrecipitationSeasonality': 'Precipitation seasonality',
        'AverageAnnualEvapoTranspiration': 'Average annual evapotranspiration',
        'AverageAridityIndex': 'Average aridity index',
        'Elevation': 'Elevation',
        'Latitude': 'Latitude',
        'ClayContent': 'Clay content',
        'SiltContent': 'Silt content',
        'SandContent': 'Sand content',
        'BulkDensity': 'Bulk density',
        'SoilOrganicCarbon': 'Soil organic carbon',
        'log_Ks': '$K_s$',
        'log_K1': '$K_{10}$',
        'log_K2': '$K_{20}$',
        'log_K3': '$K_{30}$',
        'log_K4': '$K_{40}$',
        'log_K5': '$K_{50}$',
        'log_K6': '$K_{60}$',
        'log_K7': '$K_{70}$',
        'log_K8': '$K_{80}$',
        'log_K9': '$K_{90}$',
        'log_K10': '$K_{100}$',
    }
    def getLabel(x):
        if x in dic:
            return dic[x]
        else:
            return camel_case_split(x)
    
    # create correlation matrix
    cols = np.unique(ycols + xcols).tolist()
    if weighted:
        if method == 'spearman':
            dfcorr = _weightedRankCorr(dfm2[cols], dfm2['w_eda'])
        else:
            print('only method "spearman" is avaible with weighting')
            return
    else:
        dfcorr = dfm2[cols].corr(method=method)

    # create subset of correlation matrix
    ivar = np.zeros(dfcorr.shape[0], dtype=bool)
    ivar[:len(ycols)] = True
    
    # compute pvalues
    if pshadow:
        dflab, dfpval = shadepval(dfm2[cols + ['w_eda']],
                                  dfcorr, pval=pval, randomizos=randomizos)

    # figure
    flag = False
    if ax is None:
        fig, ax = plt.subplots()
        flag = True
        
    # if we have square matrix then we only show the lower triangle
    if np.in1d(np.array(xcols), np.array(ycols)).all():
        mask = np.zeros((ivar.sum(), len(xcols)), dtype=bool)
        mask[np.triu_indices_from(mask)] = True
    else:
        mask = np.zeros((ivar.sum(), len(xcols)), dtype=bool)

    sns.heatmap(dfcorr[ivar][xcols].values, mask=mask, ax=ax, cmap='bwr',
                yticklabels=[getLabel(a) for a in ycols],
                xticklabels=[getLabel(a) for a in xcols],
                vmin=-1, vmax=1, annot=dflab[ivar][xcols].values, fmt='s',
                cbar_kws={'label': label})
    if flag:
        return fig

xcols = ['log_Ks', 'log_K1', 'log_K2', 'log_K3', 'log_K4', 'log_K5',
         'log_K6', 'log_K7', 'log_K8', 'log_K9', 'log_K10']
ycols = [
    'AnnualMeanTemperature',             
    #'MeanTemperatureofWarmestQuarter',
    'MaxTemperatureofWarmestMonth',
    #'MeanTemperatureofColdestQuarter',
    #'MinTemperatureofColdestMonth',
    #'MeanTemperatureofWettestQuarter',
    #'MeanTemperatureofDriestQuarter',
    #'TemperatureSeasonality',
    'TemperatureAnnualRange',
    'MeanDiurnalRange',
    #'Isothermality',
    'AnnualPrecipitation',
    #'PrecipitationofWarmestQuarter',
    #'PrecipitationofColdestQuarter',
    #'PrecipitationofWettestQuarter',
    #'PrecipitationofWettestMonth',
    'PrecipitationofDriestQuarter',
    #'PrecipitationofDriestMonth',
    'PrecipitationSeasonality',
    'AverageAnnualEvapoTranspiration',
    'AverageAridityIndex',
    'Elevation',
    'Latitude',
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

fig, ax = plt.subplots(figsize=(10, 7))
showCorr(dfm, xcols, ycols, ax=ax, pshadow=True, pval=0.05, randomizos=1)
fig.tight_layout()
fig.subplots_adjust(right=1.2)
fig.savefig(outputdir + 'eda-correlation-climate.jpg', dpi=500, bbox_inches='tight')
<Figure size 720x504 with 2 Axes>
# calculate significance (takes a while!)
# randomizos = 50  #number of randomized realizations of dataframe to calculate the significance level
# pvalue = _calcConfidence4WeightedRankCorr(dfm2[variables + xlabs + ['w_eda']], dfcorr, randomizos)  #calculate significance  .. take a loooooong time

# fig, ax = plt.subplots(figsize=(12, 9))
# sns.set(font_scale = 1.8)
# sns.heatmap(pvalue[ivar][xlabs].values, ax=ax, cmap='bwr',
#             yticklabels=[camel_case_split(a) for a in vnames],
#             xticklabels=knames, vmin=0, vmax=1, annot=True, fmt='1.2f',
#             annot_kws={"size": 14},
#             cbar_kws={'label': 'p-value for Spearman Rank Correlation'})
# fig.tight_layout()
# fig.subplots_adjust(right=1.2)
# fig.savefig(outputdir + 'eda-correlation-climate-pvalues_n' + str(randomizos) + '.jpg', dpi=500, bbox_inches='tight')
xcols = ['log_Ks', 'log_K1', 'log_K2', 'log_K3', 'log_K4', 'log_K5',
         'log_K6', 'log_K7', 'log_K8', 'log_K9', 'log_K10']
ycols = [
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

fig, ax = plt.subplots(figsize=(8, 4))
showCorr(dfm, xcols, ycols, ax=ax, pshadow=True, pval=0.05, randomizos=1)
fig.tight_layout()
fig.subplots_adjust(right=1.2)
fig.savefig(outputdir + 'eda-correlation-soil-k.jpg', dpi=500, bbox_inches='tight')
<Figure size 576x288 with 2 Axes>
# just for climatic variables
ycols = [
    'AnnualMeanTemperature',             
    #'MeanTemperatureofWarmestQuarter',
    'MaxTemperatureofWarmestMonth',
    #'MeanTemperatureofColdestQuarter',
    #'MinTemperatureofColdestMonth',
    #'MeanTemperatureofWettestQuarter',
    #'MeanTemperatureofDriestQuarter',
    #'TemperatureSeasonality',
    'TemperatureAnnualRange',
    'MeanDiurnalRange',  
    #'Isothermality',           
    'AnnualPrecipitation',
    #'PrecipitationofWarmestQuarter',
    #'PrecipitationofColdestQuarter',
    #'PrecipitationofWettestQuarter',
    #'PrecipitationofWettestMonth',
    'PrecipitationofDriestQuarter',
    #'PrecipitationofDriestMonth',
    'PrecipitationSeasonality',
    'AverageAnnualEvapoTranspiration',
    'AverageAridityIndex',
    'Elevation',
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

fig, ax = plt.subplots(figsize=(12, 10))
showCorr(dfm, ycols, ycols, ax=ax, pshadow=True, pval=0.05, randomizos=1)
fig.tight_layout()
fig.subplots_adjust(right=1)
fig.savefig(outputdir + 'eda-correlation-climclim.jpg', dpi=500, bbox_inches='tight')
<Figure size 864x720 with 2 Axes>
# just for climatic variables
xcols = [
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

ycols = [
    'AnnualMeanTemperature',             
    #'MeanTemperatureofWarmestQuarter',
    'MaxTemperatureofWarmestMonth',
    #'MeanTemperatureofColdestQuarter',
    #'MinTemperatureofColdestMonth',
    #'MeanTemperatureofWettestQuarter',
    #'MeanTemperatureofDriestQuarter',
    #'TemperatureSeasonality',
    'TemperatureAnnualRange',
    'MeanDiurnalRange',  
    #'Isothermality',           
    'AnnualPrecipitation',
    #'PrecipitationofWarmestQuarter',
    #'PrecipitationofColdestQuarter',
    #'PrecipitationofWettestQuarter',
    #'PrecipitationofWettestMonth',
    'PrecipitationofDriestQuarter',
    #'PrecipitationofDriestMonth',
    'PrecipitationSeasonality',
    'AverageAnnualEvapoTranspiration',
    'AverageAridityIndex',
    'Elevation'
]

# figure
fig, ax = plt.subplots(figsize=(8, 6))
showCorr(dfm, xcols, ycols, ax=ax)           
fig.tight_layout()
fig.subplots_adjust(right=0.9)
fig.savefig(outputdir + 'eda-correlation-soilclimate.jpg', dpi=500, bbox_inches='tight')
<Figure size 576x432 with 2 Axes>
# correlation matrix per categories

dfm2=dfm.copy()

ycols = ['AnnualMeanTemperature',             
#'MeanTemperatureofWarmestQuarter',
'MaxTemperatureofWarmestMonth',
#'MeanTemperatureofColdestQuarter',
#'MinTemperatureofColdestMonth',
#'MeanTemperatureofWettestQuarter',
#'MeanTemperatureofDriestQuarter',
#'TemperatureSeasonality',
'TemperatureAnnualRange',
'MeanDiurnalRange',  
#'Isothermality',           
'AnnualPrecipitation',
#'PrecipitationofWarmestQuarter',
#'PrecipitationofColdestQuarter',
#'PrecipitationofWettestQuarter',
#'PrecipitationofWettestMonth',
'PrecipitationofDriestQuarter',
#'PrecipitationofDriestMonth',
'PrecipitationSeasonality',
'AverageAnnualEvapoTranspiration',
'AverageAridityIndex',
'Elevation',
'ClayContent',
'SiltContent',
'SandContent',
'BulkDensity',
'SoilOrganicCarbon']

# creating additional classes
ohLU = np.zeros(dfm2.shape[1], dtype=bool)
ohLU = dfm2['LanduseClass']
dfm2['LandUseArable'] = ohLU == 'arable'
dfm2['LandUseGrassland'] = ohLU == 'grassland'
dfm2['LandUseForest'] = ohLU == 'woodland/plantation'

ohLU = dfm2['TillageClass']
dfm2['ConventionalTillage'] = ohLU == 'conventional tillage'
dfm2['ReducedTillage'] = ohLU == 'reduced tillage'
dfm2['NoTillage'] = ohLU == 'no tillage' 

ohLU = dfm2['CompactionClass']
dfm2['Compacted'] = ohLU == 'compacted'

ohLU = dfm2['SamplingTimeClass']
dfm2['FreshlyTilled'] = ohLU == 'after tillage'

#dfm2['ConventionalTillage'].replace(np.NaN, False)
#dfm2['ReducedTillage'].replace(np.NaN, False)
#dfm2['NoTillage'].replace(np.NaN, False)
xcols = ['log_Ks', 'log_K1', 'log_K2', 'log_K3', 'log_K4', 'log_K5',
         'log_K6', 'log_K7', 'log_K8', 'log_K9', 'log_K10']

ycols = [
    'LandUseArable',
    'LandUseGrassland',
    'LandUseForest', 
    'ConventionalTillage',
    'ReducedTillage',
    'NoTillage',
    'Compacted',
    'FreshlyTilled'
]

# figure
fig, ax = plt.subplots(figsize=(10, 6))
showCorr(dfm2, xcols, ycols, ax=ax)
fig.tight_layout()
fig.subplots_adjust(right=0.9)
fig.savefig(outputdir + 'eda-correlation-climateLU.jpg', dpi=500, bbox_inches='tight')
<Figure size 720x432 with 2 Axes>
# correlation matrix soil variables per practices

dfm2=dfm.copy()


ohLU = np.zeros(dfm2.shape[1], dtype=bool)
ohLU = dfm2['LanduseClass']
dfm2['LandUseArable'] = ohLU == 'arable'
dfm2['LandUseGrassland'] = ohLU == 'grassland'
dfm2['LandUseForest'] = ohLU == 'woodland/plantation'

ohLU = dfm2['TillageClass']
dfm2['ConventionalTillage'] = ohLU == 'conventional tillage'
dfm2['ReducedTillage'] = ohLU == 'reduced tillage'
dfm2['NoTillage'] = ohLU == 'no tillage' 

ohLU = dfm2['CompactionClass']
dfm2['Compacted'] = ohLU == 'compacted'

ohLU = dfm2['SamplingTimeClass']
dfm2['FreshlyTilled'] = ohLU == 'after tillage'

xcols = [
    'ClayContent',
    'SiltContent',
    'SandContent',
    'BulkDensity',
    'SoilOrganicCarbon'
]

ycols = [
    'LandUseArable',
    'LandUseGrassland',
    'LandUseForest',
    'ConventionalTillage',
    'ReducedTillage',
    'NoTillage',
    'Compacted',
    'FreshlyTilled'
]

# figure
fig, ax = plt.subplots(figsize=(6, 6))
showCorr(dfm2, xcols, ycols, ax=ax)
fig.tight_layout()
fig.subplots_adjust(right=0.9)
fig.savefig(outputdir + 'eda-correlation-soilLU.jpg', dpi=500, bbox_inches='tight')
<Figure size 432x432 with 2 Axes>
# correlation matrix land use vs k
dfm2=dfm.copy()

ohLU = np.zeros(dfm2.shape[1], dtype=bool)
ohLU = dfm2['LanduseClass']
dfm2['LandUseArable'] = ohLU == 'arable'
dfm2['LandUseGrassland'] = ohLU == 'grassland'
dfm2['LandUseForest'] = ohLU == 'woodland/plantation'

ohLU = dfm2['TillageClass']
dfm2['ConventionalTillage'] = ohLU == 'conventional tillage'
dfm2['ReducedTillage'] = ohLU == 'reduced tillage'
dfm2['NoTillage'] = ohLU == 'no tillage' 

ohLU = dfm2['CompactionClass']
dfm2['Compacted'] = ohLU == 'compacted'

ohLU = dfm2['SamplingTimeClass']
dfm2['FreshlyTilled'] = ohLU == 'after tillage'


ycols = [
    'LandUseArable',
    'LandUseGrassland',
    'LandUseForest',
    'ConventionalTillage',
    'ReducedTillage',
    'NoTillage',
    'Compacted',
    'FreshlyTilled'
]

# figure
fig, ax = plt.subplots(figsize=(6, 6))
showCorr(dfm2, xcols, ycols, ax=ax)        
fig.tight_layout()
fig.subplots_adjust(right=0.9)
fig.savefig(outputdir + 'eda-correlation-LU_K.jpg', dpi=500, bbox_inches='tight')
<Figure size 432x432 with 2 Axes>

#Evolution with tensions

# define a subset (here we take all the filtered data)
ifull = np.ones(dfm.shape[0], dtype=bool)
# ifull = dfm['Tmin'].le(40) & dfm['Tmax'].ge(100)
print(np.sum(ifull), '/', ifull.shape[0], '({:.0f}%)'.format(
    np.sum(ifull)/ifull.shape[0]*100))
476 / 476 (100%)
# evolution of K per tension soil texture (and define all useful functions)
gcol = 'SoilTextureUSDA'
def func(x):
    dico = {
        np.nan: np.nan,
        'organic': np.nan,
        'clay': 'clay',
        'clay loam': 'loam',
        'loam': 'loam',
        'loamy sand': 'sand',
        'sand': 'sand',
        'sandy clay': 'clay',
        'sandy clay loam': 'loam',
        'sandy loam': 'loam',
        'silt loam': 'loam',
        'silty clay': 'loam',
        'silty clay loam': 'loam'
    }
    return dico[x]
dfm2 = dfm[ifull].copy()
dfm2[gcol] = dfm2[gcol].apply(func)

def _createDF(dfm2, gcol, weight=True):
    dfg = dfm2.groupby(gcol)
    if weight:
        def func(sdf):
            cols = sdf.columns[sdf.dtypes.ne('object')]
            return (sdf[cols].mul(sdf['w_eda'], axis=0)).sum()/sdf['w_eda'].sum()
        dfmean = dfg.apply(func).reset_index()
        def func(sdf):
            cols = sdf.columns[sdf.dtypes.ne('object')]
            return (sdf[cols].std()*np.sqrt(sdf['w_eda'].pow(2).sum()))/sdf['w_eda'].sum()
        dfsem = dfg.apply(func).reset_index()
    else:
        dfmean = dfg.mean().reset_index()
        dfsem = dfg.sem().reset_index()
    dfcount = dfg.count().reset_index()
    df = pd.merge(dfmean, dfsem, on=gcol, suffixes=('','_sem'))
    df = pd.merge(df, dfcount, on=gcol, suffixes=('', '_count'))
    return df

def plotKfig(dfm2, gcol, ax=None, title=None, lloc='best'):
    df = _createDF(dfm2, gcol)
    cols = ['log_' + a for a in kcols]
    semcols = [a + '_sem' for a in cols]
    if ax is None:
        fig, ax = plt.subplots()
    for val in df[gcol].unique():
        ie = df[gcol] == val
        cax = ax.errorbar(np.arange(0, 11)*10, df[ie][cols].values.flatten(),
                    yerr=df[ie][semcols].values.flatten(),
                    marker='.', label=val)
        ax.fill_between(np.arange(0, 11)*10,
                       df[ie][cols].values.flatten() - df[ie][semcols].values.flatten(),
                       df[ie][cols].values.flatten() + df[ie][semcols].values.flatten(),
                       alpha=0.2, color=cax.get_children()[0].get_color())
    ax.legend(loc=lloc, prop={"family": "Arial", "size":12})
    #ax.set_yscale('log')
    ax.set_title(title, fontsize=14)
    #matplotlib.rc('axes', labelsize=16, titlesize=13)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontname('Arial')
        label.set_fontsize(12)
    ax.set_ylim([0, 2.5])
    ax.set_xlabel('$h$ [mm]', fontname='Arial', fontsize=14)
    ax.set_ylabel('log$_{10}\:K_h$ [mm.h$^{-1}$]', fontname='Arial', fontsize=14)

def plotKbar(dfm2, gcol, fig, pos=[0.15, 0.15, 0.2, 0.2]):
    df = _createDF(dfm2, gcol)
    axh = fig.add_axes(pos)
    axh.patch.set_alpha(0.0)
    for i, c in enumerate(df[gcol].unique()):
        sc = dfm2[gcol].eq(c).sum()
        axh.bar(i, sc)
        axh.text(i, sc, '{:d}'.format(sc), ha='center', va='bottom', fontsize=12, fontname='Arial')
    axh.grid(False)
    axh.spines['top'].set_visible(False)
    axh.spines['right'].set_visible(False)
    axh.set_yticks([])
    axh.set_xticks([])
    #axh.set_ylabel('Nb. entries', fontsize=10)

fig, ax = plt.subplots()
plotKfig(dfm2, gcol, ax=ax, title='Soil Texture USDA')
plotKbar(dfm2, gcol, fig)
fig.savefig(outputdir + 'k-' + gcol + '.jpg', dpi=300)
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
<Figure size 432x288 with 2 Axes>
# subplots figure for climate (arid, precipitation, temperature)
fig, axs = plt.subplots(1, 3, figsize=(12, 3), sharex=True)

gcol = 'AridityClass'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = dfm2[gcol].replace({'Hyper Arid': 'Arid', 'Semi-Arid': 'Arid'})
plotKfig(dfm2, gcol, ax=axs[0], title='(a) Aridity class')
plotKbar(dfm2, gcol, fig, pos=[0.08, 0.26, 0.1, 0.15])

gcol = 'PrecipitationSeasonality'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = pd.cut(dfm2[gcol], bins=[0, 25, 55, 120])
plotKfig(dfm2, gcol, ax=axs[1], title='(b) Precipitation seasonality', lloc='upper right')
plotKbar(dfm2, gcol, fig, pos=[0.4, 0.26, 0.1, 0.15])

gcol = 'MeanDiurnalRange'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = pd.cut(dfm2[gcol], bins=[0, 10, 12, 20])
plotKfig(dfm2, gcol, ax=axs[2], title='(c) Mean diurnal range [degC]')
plotKbar(dfm2, gcol, fig, pos=[0.72, 0.26, 0.1, 0.15])

axs[1].set_ylabel('')
axs[2].set_ylabel('')

fig.tight_layout()
fig.savefig(outputdir + 'k-climate.jpg', dpi=300)
<Figure size 864x216 with 6 Axes>
# subplots figure for method (diameter, direction, method, seasons)
fig, axs = plt.subplots(1, 3, figsize=(12, 3), sharex=True)

gcol = 'Diameter'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = pd.cut(dfm2[gcol], bins=[0, 10, 20, 26])
plotKfig(dfm2, gcol, ax=axs[0], title='(a) Disk diameter [cm]')
plotKbar(dfm2, gcol, fig, pos=[0.08, 0.26, 0.1, 0.15])

gcol = 'Direction'
dfm2 = dfm[ifull].copy().replace({'unknown': np.nan})
plotKfig(dfm2, gcol, ax=axs[1], title='(b) Direction')
plotKbar(dfm2, gcol, fig, pos=[0.4, 0.26, 0.1, 0.15])

gcol = 'Method'
def func(x):
    dico = {
        np.nan: np.nan,
        'Steady-state constant': 'S-S. constant',
        'Steady-state piecewise': 'S-S. piecewise',
        'Steady-state multi-disc': 'S-S. multi-disk',
        'Steady-state constant  regression': 'S-S. constant',
        'Steady-state piecewise': 'S-S. piecewise',
        'Transient': 'Transient',
        'Other': np.nan,
        'Steady-state  multi-disc': 'S-S. multi-disc',
        '1D column': np.nan,
        np.nan: np.nan,
       'DISC software': np.nan,
       'steady-state constant regression': 'S-S. constant',
       'unclear - reference method': np.nan
    }
    return dico[x]
dfm2 = dfm[ifull].copy().replace({'unknown': np.nan})
dfm2[gcol] = dfm2[gcol].apply(func)
plotKfig(dfm2, gcol, ax=axs[2], title='(c) Method', lloc='upper right')
plotKbar(dfm2, gcol, fig, pos=[0.72, 0.26, 0.1, 0.15])

axs[1].set_ylabel('')
axs[2].set_ylabel('')

fig.tight_layout()
fig.savefig(outputdir + 'k-methods.jpg', dpi=300)
<Figure size 864x216 with 6 Axes>
# subplots figure for soil (texture FAO/USDA, BD, SOC)
fig, axs = plt.subplots(1, 3, figsize=(12, 3), sharex=True)
axs = axs.flatten()

gcol = 'SoilTextureUSDA'
def func(x):
    dico = {
        # np.nan: np.nan,
        # 'organic': np.nan,
        # 'clay': 'clay',
        # 'clay loam': 'loam',
        # 'loam': 'loam',
        # 'loamy sand': 'sand',
        # 'sand': 'sand',
        # 'sandy clay': 'clay',
        # 'sandy clay loam': 'loam',
        # 'sandy loam': 'loam',
        # 'silt loam': 'loam',
        # 'silty clay': 'loam',
        # 'silty clay loam': 'loam'
        np.nan: np.nan,
        'organic': np.nan,
        'clay': 'fine',
        'clay loam': 'fine',
        'loam': 'medium',
        'loamy sand': 'coarse',
        'sand': 'coarse',
        'sandy clay': 'coarse',
        'sandy clay loam': 'coarse',
        'sandy loam': 'coarse',
        'silt loam': 'medium',
        'silty clay': 'fine',
        'silty clay loam': 'fine'
    }
    return dico[x]
dfm2 = dfm[ifull].copy()
dfm2[gcol] = dfm2[gcol].apply(func)
plotKfig(dfm2, gcol, ax=axs[0], title='(a) Soil texture')
plotKbar(dfm2, gcol, fig, pos=[0.08, 0.26, 0.1, 0.15])

gcol = 'BulkDensity'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = pd.cut(dfm2[gcol], bins=[0.5, 1.3, 1.4, 2.0])
plotKfig(dfm2, gcol, ax=axs[1], title='(b) Soil bulk density [g.cm$^{-3}$]')
plotKbar(dfm2, gcol, fig, pos=[0.4, 0.26, 0.1, 0.15])

gcol = 'SoilOrganicCarbon'
dfm2 = dfm[ifull].copy()
dfm2[gcol] = pd.cut(dfm2[gcol], bins=[0.0, 0.01, 0.02, 0.03, 0.1])
plotKfig(dfm2, gcol, ax=axs[2], title='(c) Soil organic carbon [kg.kg$^{-1}$]')
plotKbar(dfm2, gcol, fig, pos=[0.72, 0.26, 0.1, 0.15])


axs[1].set_ylabel('')
axs[2].set_ylabel('')
axs[0].set_ylim([0, 2])

fig.tight_layout()
fig.savefig(outputdir + 'k-soil.jpg', dpi=500)
<Figure size 864x216 with 6 Axes>