Natural language processing on primary studies selected from meta-analyses¶
This notebook explores natural language processing (NLP) techniques on the corpus of primary studies selected in the review of meta-analyses. It covers:
- topic modelling
- relationship extraction
NOTE: this notebook needs the full-text papers to run. For copyright reasons, these are not provided with the notebooks.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd
import time
import os
import shutil
import re
from tqdm.notebook import tqdm_notebook as tqdm
from lxml import etree, html
import requests
import requests_cache
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy import displacy
import json
from pprint import pprint
import networkx as nx # drawing network graph
import unicodedata
from io import StringIO
from scipy import stats
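# note: the small English model may need to be installed first with: python -m spacy download en_core_web_sm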
nlp = spacy.load("en_core_web_sm")
import warnings
warnings.simplefilter('ignore')
datadir = '../data/from-meta/'
outputdir = datadir + '../../figures/'
dfscore = pd.DataFrame(columns=['step', 'n', 'recall', 'precision', 'f1', 'match'])  # use a list so the column order is preserved
def camelCase(x):
    return x.lower()[0] + x.title().replace(' ', '')[1:]
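# e.g. camelCase('cover crop') -> 'coverCrop'; these keys are used for the spaCy matchers further down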
# gensim imports
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
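# note: the NLTK resources may need to be downloaded once, e.g. nltk.download('stopwords') and nltk.download('wordnet')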
import gensim
from gensim.models import Phrases
from gensim.models import LdaModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim import models
from gensim import similarities
import pyLDAvis
from pyLDAvis import gensim_models as gensimvis
print('spacy', spacy.__version__)
print('gensim', gensim.__version__)
print('nltk', nltk.__version__)
print('pyLDAvis', pyLDAvis.__version__)
spacy 3.1.1
gensim 4.0.1
nltk 3.6.1
pyLDAvis 3.3.1
Preparing publications¶
# some statistics (start here)
dfpub = pd.read_csv(datadir + 'fromMeta.csv')
print('total entries:', dfpub.shape[0])
print('with doi:', dfpub['DOI'].notnull().sum())
print('retrieved:', dfpub['retrieved'].sum())
# we will only work with the ones which have full-text
dfpub = dfpub[dfpub['retrieved']].reset_index(drop=True)
total entries: 1964
with doi: 1964
retrieved: 1469
# create rawdocs and extract the text
rawdocs = []
c = 0
for fname in dfpub['fname']:
# convert to text (only need to do it once)
#os.system('pdftotext "' + datadir + 'papers/' + fname + '.pdf"')
fpath = datadir + 'papers/' + fname + '.txt'
if os.path.exists(fpath):
with open(fpath, 'r', encoding='utf8') as f:
try:
text = f.read()
if len(text) > 100:
rawdocs.append(text)
else:
print('document too small', fname)
c += 1
except:
print('error with', fname)
rawdocs.append('')
else:
print('path not found for', fname, fpath) # pdf are empty
rawdocs.append('')
print('imported:', c, '/', dfpub.shape[0])
imported: 1469 / 1469
Detecting abstract and references sections¶
The abstract is detected based on rules (specific keywords and paragraph boundaries are identified). The references section is also identified. Both the abstract and the references are removed from the main document, and rawdocs is overwritten.
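A minimal illustration of the splitting rules on a toy document (a sketch only; the toy string and variable names are illustrative and not part of the pipeline below):
import re
toy = "Title of the paper\n\nAbstract\nTillage affects soil structure.\n\nKeywords: tillage\n\nIntroduction\n..."
# everything after the first abstract keyword
head = re.split('a b s t r a c t|abstract|ABSTRACT|Abstract|A B S T R A C T|Summary', toy)[1]
# cut at the first end-of-abstract marker (Keywords, Introduction, copyright notice, ...)
candidate = re.split('Published|Introduction|Copyright|©|Keywords|keywords', head)[0].strip()
print(candidate)  # expected: 'Tillage affects soil structure.'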
# detect abstract, keywords, reference, title - overwrite rawdocs!
abstracts = []
rawdocs = []
df = pd.DataFrame(columns=['fname', 'start', 'end', 'length', 'code'])
dfpub['abstract'] = pd.NA
for i, fname in tqdm(enumerate(dfpub.loc[:, 'fname']), total=dfpub.shape[0]):
fpath = datadir + 'papers/' + str(fname) + '.txt'
code = '' # determine how the abstract was found
if (pd.isna(fname) is False) & (os.path.exists(fpath)):
with open(fpath, 'r', encoding='utf8') as f:
doc = f.read()
splitdoc = re.split('a b s t r a c t|abstract|ABSTRACT|Abstract|A B S T R A C T|Summary|s u m m a r y|SUMMARY', doc)
foundStart = True
if len(splitdoc) == 1:
foundStart = False
if len(' '.join(splitdoc[1:])) > 50:
text = '\n'.join(splitdoc[1:]) if len(splitdoc) > 1 else splitdoc[0] # if no abstract keyword was found, take the whole text
else:
#print('take it all for', fname)
text = doc
# assuming first paragraph is the abstract
splitEnd = re.split('Published|Introduction|Copyright|©|Keywords|keywords|KEYWORDS|KEY WORDS|Citation', text)
foundEnd = True
if len(splitEnd) == 1:
foundEnd = False
#print('abstract end not found', fname)
paragraphs = text.split('\n\n')
ps = []
a = ''
# we only want paragraphs split that are more than 10 characters
for p in paragraphs:
a = a + '\n\n' + p
if len(a) > 10:
ps.append(a)
a = ''
if foundStart is True:
abstract = ps[0]
code = 'ps[0]'
else:
# without detection of the start or end, we
# blindly assume that abstract is first paragraph
abstract = ps[1]
code = 'first'
#print('abstract = first paragraph', fname)
else:
abstract = ''
for j, s in enumerate(splitEnd):
if len(s) > 50 and abstract == '':
abstract = s
code = 's{:d}'.format(j)
# cleaning up the abstract
if len(abstract) > 0 and abstract[0] in [':', '.']:
abstract = abstract[1:]
abstract = abstract.strip()
# edge case (if we have two first large paragraphs)
ts = abstract.split('\n\n')
if len(ts) > 1:
if (len(ts[0]) > 800) & (len(ts[1]) > 800):
if ts[0][0] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
abstract = ts[0]
#print(fname, 'first')
code = 'ts[0]'
else:
abstract = ts[1]
#print(fname, 'second')
code = 'ts[1]'
# write down abstract for manual assessment
with open(fpath.replace('.txt', '-abstract.txt'), 'w', encoding='utf8') as f:
f.write(abstract)
df = df.append({'fname': fname, 'start': foundStart, 'end':
foundEnd, 'length': len(abstract), 'code': code},
ignore_index=True)
# add the abstract to dfpub
dfpub.loc[i, 'abstract'] = abstract
# remove the abstract from the body
body = text.replace(abstract, '')
# remove the references
#if len(re.findall('r(é|e)f(é|e)rences?|r ?e ?f ?e ?r ?e ?n ?c ?e ?s?', body, flags=re.IGNORECASE)) == 0:
# print('no ref found for', fname)
parts = re.split('\n\s?r(é|e)f(é|e)rences?\n|\n\s?r ?e ?f ?e ?r ?e ?n ?c ?e ?s?\n', body, flags=re.IGNORECASE)
if len(parts) > 2: # at least one 'reference' found
body = '\nReferences\n'.join([a for a in parts[:-1] if a is not None])
else:
body = body
#print('ref not found for', dfpub.loc[i, 'fname'])
# for old papers processed with OCR, the reference section is often not detected
# failsafe for badly detected abstract
ratio = len(body) / len(text)
if ratio < 0.6:
#print('abstract probably not well detected for', fname, '(ratio: {:.2f})'.format(ratio))
body = text
rawdocs.append(body)
else:
print(fpath, 'not found')
print('number of abstracts detected:', dfpub['abstract'].notnull().sum(), '/', dfpub.shape[0])
Parsing abstracts from webpages¶
This approach is more reliable than the PDF-to-text conversion because most special symbols are preserved. In addition, when the PDF has two or more columns, the abstract is sometimes not the first paragraph but the second.
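A minimal sketch of the html route on a toy snippet (the snippet, variable names and xpath are illustrative, not taken from a real publisher page):
from lxml import html
snippet = "<html><body><h2>Abstract</h2><div class='abstract'><p>Cover crops increased infiltration in most studies.</p></div></body></html>"
tree = html.fromstring(snippet)
heading = tree.xpath("//h2[text()='Abstract']")[0]
# on real pages the abstract text sits in a container around or next to the heading
abstract = ' '.join(heading.getparent().xpath('.//p/text()')).strip()
print(abstract)  # expected: 'Cover crops increased infiltration in most studies.'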
# getting html page (run this once)
# from selenium import webdriver
# driver = webdriver.Firefox()
# for i in tqdm(range(0, dfpub.shape[0])):
# doi = dfpub.loc[i, 'DOI']
# fname = dfpub.loc[i, 'fname']
# if (pd.isna(fname) is False) & (pd.isna(doi) is False):
# # open each doi in a new tab
# #webbrowser.open('https://doi.org/' + doi.strip())
# # requests is not enough to get a proper redirection, better use selenium
# driver.get('http://doi.org/' + doi)
# time.sleep(2) # wait a bit for the page to be fully loaded
# with open(datadir + 'webpages/' + fname + '.html', 'w', encoding='utf8') as f:
# f.write(driver.page_source)
# driver.close()
# parsing html publication to extract abstract
abstracts = []
for i in tqdm(range(0, dfpub.shape[0])):
doi = dfpub.loc[i, 'DOI']
fname = dfpub.loc[i, 'fname']
fpath = datadir + 'webpages/{:s}.html'.format(fname)
if (pd.isna(fname) is False) & (pd.isna(doi) is False) & (os.path.exists(fpath)):
#print(i, fname, 'https://doi.org/' + doi, end='')
with open(fpath, 'r', encoding='utf8') as f:
tree = html.parse(StringIO(f.read()))
try:
# parsing abstract
htitle = tree.xpath("//h1[text()='Abstract'] | "
"//h2[text()='Abstract'] | "
"//h3[text()='Abstract'] | "
"//h4[text()='Abstract'] | "
"//h1[text()='ABSTRACT'] | "
"//h2[text()='ABSTRACT'] | "
"//h3[text()='ABSTRACT'] | "
"//h4[text()='ABSTRACT'] | "
"//h1[text()='Summary'] | "
"//h2[text()='Summary'] | "
"//h3[text()='Summary'] | "
"//h4[text()='Summary'] | "
"//h1[text()='Abstracts'] | "
"//h2[text()='Abstracts'] | "
"//h3[text()='Abstracts'] | "
"//h4[text()='Abstracts'] | "
"//strong[text()='Abstract.']"
)
parent = None
if len(htitle) > 0:
parent = htitle[0].getparent()
#print(' -> found', parent.tag)
else:
# try to find the p or div with the longest text
c = 0
for p in tree.xpath('//p'):
# detect only the text at first level, not nested
pt = ' '.join(p.xpath('.//text()[normalize-space()]'))
if len(pt) > c:
parent = p
c = len(pt)
# the following is matching too many div
for div in tree.xpath('//div'):
divt = ' '.join(div.xpath('./text()[normalize-space()]'))
if len(divt) > c and len(divt) < 2000:
parent = div
c = len(divt)
#print(' -> inferred', parent.tag)#, parent, parent.attrib, c)
xpath = './/text() | .//div/text() | .//div/p/text() | .//p/text() |' \
'.//div/i/text() | .//div/p/i/text() | .//p/i/text() |' \
'.//div/b/text() | .//div/p/b/text() | .//p/b/text() |' \
'.//div/sup/text() | .//div/p/sup/text() | .//p/sup/text() |' \
'.//div/sub/text() | .//div/p/sub/text() | .//p/sub/text() |' \
'.//div/emph/text() | .//div/p/emph/text() | .//p/emph/text() |' \
'.//div/span/text() | .//div/p/span/text() | .//p/span/text() |' \
'.//div/span/span/text() | .//div/p/span/span/text() | .//p/span/span/text() |' \
'.//div/span/span/span/text() | .//div/p/span/span/span/text() | .//p/span/span/span/text() |' \
'.//div/span/span/a/text() | .//div/p/span/span/a/text() | .//p/span/span/a/text() |' \
'.//div/span/a/text() | .//div/p/span/a/text() | .//p/span/a/text() |' \
'.//div/em/text() | .//div/p/em/text() | .//p/em/text()'
if parent is not None:
#print(parent, parent.attrib)
elms = parent.xpath(xpath)
if len(' '.join(elms)) < 100:
gparent = parent.getparent()
#print('going for grandparents', gparent, gparent.attrib)
#print([(a, a.attrib) for a in gparent.xpath('.//div')])
elms = gparent.xpath('.//div')[1].xpath(xpath)
abstract = ' '.join(elms)
else:
# fall back: take the p or div with the longest text
abstract = ''
for p in tree.xpath('//p'):
    pt = ' '.join(p.xpath('.//text()[normalize-space()]'))
    if len(pt) > len(abstract):
        abstract = pt
for div in tree.xpath('//div'):
    divt = ' '.join(div.xpath('./text()[normalize-space()]'))
    if len(divt) > len(abstract):
        abstract = divt
if abstract == '':
#print(' -> not found')
pass
else:
pass
#print(' -> long p found')
# cleaning up
abstract = abstract.strip()
abstract = re.sub('Abstracts?\.?[ \n\t]+', '', abstract)
abstract = re.sub('ABSTRACTS?\.?[ \n\t]+', '', abstract)
abstract = re.sub('Summary\.?[ \n\t]+', '', abstract)
abstract = abstract.split('Keywords')[0]
abstract = abstract.split('Key words')[0]
abstract = abstract.split('Copyright')[0]
abstract = abstract.split('©')[0]
if len(re.split('\n\s+\n', abstract)) < 3:
a = re.split('\n\s+\n', abstract)[0]
if len(a) > 200:
abstract = a
abstract = re.sub('[ \t\n]{2,}', '', abstract)
abstract = abstract.strip()
abstract = unicodedata.normalize("NFKD", abstract)
#print('\n' + abstract + '\n')
abstracts.append(abstract)
dfpub.loc[i, 'abstract-web'] = abstract
except Exception as e:
print('-> FAILED with error: ' + str(e))
print('{:d}/{:d} abstracts found'.format(dfpub['abstract-web'].notnull().sum(), dfpub.shape[0]))
Topic classification with gensim¶
useful links:
- LDAvis paper https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
- https://nlpforhackers.io/topic-modeling/
The coherence metric used is the C_v coherence of Röder et al. (2015).
# Tokenize the documents.
docs = []
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for doc in rawdocs:
docs.append(tokenizer.tokenize(doc.lower()))
# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
# Remove words that are 3 characters or shorter.
docs = [[token for token in doc if len(token) > 3] for doc in docs]
# Remove Stop Words
stop_words = stopwords.words('english')
docs = [[word for word in doc if word not in stop_words] for doc in docs]
# Lemmatize the documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
# Add bigrams to docs (only pairs that appear 20 times or more).
# e.g. if 'soil' is followed by 'moisture' at least 20 times, 'soil_moisture' becomes a bigram token
bigram = Phrases(docs, min_count=20)
docs2 = []
for idx in range(len(docs)):
tmp = []
for token in bigram[docs[idx]]:
if '_' in token:
# Token is a bigram, add to document.
docs[idx].append(token)
tmp.append(token)
docs2.append(tmp)
# overwrite docs with docs2 (only contains bigrams)
docs = docs2
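As a quick sanity check, the trained Phrases model can be applied to any token list (a sketch; the example tokens are illustrative and the merged output depends on the corpus statistics):
example = ['cover', 'crop', 'reduced', 'runoff']
# pairs that co-occur often enough (min_count and the scoring threshold) are merged into single 'a_b' tokens
print(bigram[example])  # e.g. ['cover_crop', 'reduced', 'runoff'] if the pair is frequent enough in the corpus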
# filter out irrelevant words
# create a dictionary representation of the documents
dictionary = Dictionary(docs)
# filter out words that occur in less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
# bag-of-words (bow) representation of the documents
# e.g. this word appears 5 times in the first document
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: {:d}'.format(len(dictionary)))
print('Number of documents: {:d}'.format(len(corpus)))
Number of unique tokens: 2357
Number of documents: 1469
# Train LDA model (Latent Dirichlet Allocation)
num_topics = 6
# Make an index-to-word dictionary.
temp = dictionary[0] # This is only to "load" the dictionary.
id2word = dictionary.id2token
# create the model
model = LdaModel(
#workers=4,
corpus=corpus,
id2word=dictionary,
#chunksize=1000,
#alpha='auto',
#eta='auto',
#iterations=100,
num_topics=num_topics,
#passes=10,
#eval_every=None # don't evaluate model perplexity, takes too much time.
)
# compute the topic coherence with the selected metric and keep the top N words per topic
t0 = time.time()
#top_topics = model.top_topics(corpus, coherence='u_mass', topn=5)
top_topics = model.top_topics(texts=docs, coherence='c_v', dictionary=dictionary, topn=6)
print('elapsed: {:.2f}s'.format(time.time() - t0))
# average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
# probability of a word belonging to a topic
# model.get_term_topics('cover_crop')
# bow_list =['cover_crop', 'water_content']
# # convert to bag of words format first
# bow = model.id2word.doc2bow(bow_list)
# # interpreting the data
# doc_topics, word_topics, phi_values = model.get_document_topics(bow, per_word_topics = True)
pprint(top_topics)
elapsed: 9.67s
Average topic coherence: 0.4629.
[([(0.022423595, 'crop_residue'),
(0.017042536, 'cropping_system'),
(0.015680043, 'cover_crop'),
(0.013967869, 'crop_rotation'),
(0.009307228, 'microbial_biomass'),
(0.009229824, 'winter_wheat')],
0.5819526761463872),
([(0.040916074, 'cover_crop'),
(0.01711834, 'conventional_tillage'),
(0.01342745, 'water_content'),
(0.01229791, 'cropping_system'),
(0.010053033, 'crop_residue'),
(0.00991174, 'grain_yield')],
0.4957264323895248),
([(0.022167865, 'water_content'),
(0.017181909, 'grain_yield'),
(0.012952353, 'conventional_tillage'),
(0.011155198, 'growing_season'),
(0.010262345, 'cover_crop'),
(0.007419842, 'cropping_system')],
0.48013462416707475),
([(0.022890551, 'microbial_biomass'),
(0.014957421, 'crop_residue'),
(0.01094401, 'aggregate_size'),
(0.010818942, 'conservation_tillage'),
(0.009582708, 'meta_analysis'),
(0.009033364, 'water_retention')],
0.4173767887534317),
([(0.041067652, 'cover_crop'),
(0.017838314, 'water_content'),
(0.0138873905, 'cropping_system'),
(0.0097843995, 'hairy_vetch'),
(0.009302901, 'water_retention'),
(0.009294609, 'conventional_tillage')],
0.4168424600808034),
([(0.049234726, 'cover_crop'),
(0.011948575, 'conventional_tillage'),
(0.011156568, 'microbial_biomass'),
(0.009945133, 'significant_difference'),
(0.008677175, 'growing_season'),
(0.0074653015, 'aggregate_stability')],
0.3853531630668887)]
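The same C_v coherence can also be computed directly with gensim's CoherenceModel (a sketch; it should give a value close to the average coherence printed above):
from gensim.models import CoherenceModel
cm = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v', topn=6)
print('average C_v coherence: {:.3f}'.format(cm.get_coherence()))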
model.print_topics()
[(0,
'0.041*"cover_crop" + 0.018*"water_content" + 0.014*"cropping_system" + 0.010*"hairy_vetch" + 0.009*"water_retention" + 0.009*"conventional_tillage" + 0.009*"management_practice" + 0.009*"aggregate_stability" + 0.008*"crop_rotation" + 0.008*"crop_residue"'),
(1,
'0.022*"crop_residue" + 0.017*"cropping_system" + 0.016*"cover_crop" + 0.014*"crop_rotation" + 0.009*"microbial_biomass" + 0.009*"winter_wheat" + 0.009*"aggregate_size" + 0.009*"aggregate_stability" + 0.008*"conventional_tillage" + 0.007*"growing_season"'),
(2,
'0.041*"cover_crop" + 0.017*"conventional_tillage" + 0.013*"water_content" + 0.012*"cropping_system" + 0.010*"crop_residue" + 0.010*"grain_yield" + 0.009*"biochar_application" + 0.008*"water_retention" + 0.007*"growing_season" + 0.007*"conservation_tillage"'),
(3,
'0.049*"cover_crop" + 0.012*"conventional_tillage" + 0.011*"microbial_biomass" + 0.010*"significant_difference" + 0.009*"growing_season" + 0.007*"aggregate_stability" + 0.007*"cropping_system" + 0.007*"crop_rotation" + 0.005*"infiltration_rate" + 0.005*"sewage_sludge"'),
(4,
'0.022*"water_content" + 0.017*"grain_yield" + 0.013*"conventional_tillage" + 0.011*"growing_season" + 0.010*"cover_crop" + 0.007*"cropping_system" + 0.007*"crop_residue" + 0.007*"microbial_biomass" + 0.006*"management_practice" + 0.006*"moisture_content"'),
(5,
'0.023*"microbial_biomass" + 0.015*"crop_residue" + 0.011*"aggregate_size" + 0.011*"conservation_tillage" + 0.010*"meta_analysis" + 0.009*"water_retention" + 0.008*"infiltration_rate" + 0.008*"aggregate_stability" + 0.008*"cover_crop" + 0.007*"water_content"')]
# show interactive visualisation
pyLDAvis.disable_notebook() # comment this line out to render the interactive visualisation in the notebook
# https://nbviewer.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb#topic=0&lambda=1&term=
out = gensimvis.prepare(model, corpus, dictionary)
out
PreparedData(topic_coordinates= x y topics cluster Freq
topic
5 -0.034185 -0.003713 1 1 19.673994
4 -0.003522 0.022248 2 1 17.970774
3 0.017644 -0.016386 3 1 17.386206
2 0.010941 0.017702 4 1 15.614775
0 0.018621 -0.005039 5 1 15.096809
1 -0.009500 -0.014812 6 1 14.257442, topic_info= Term Freq Total Category logprob \
288 cover_crop 9454.000000 9454.000000 Default 30.0000
92 crop_residue 3848.000000 3848.000000 Default 29.0000
1574 hairy_vetch 844.000000 844.000000 Default 28.0000
61 water_content 4272.000000 4272.000000 Default 27.0000
388 grain_yield 2583.000000 2583.000000 Default 26.0000
... ... ... ... ... ...
497 microbial_community 197.300063 1224.686077 Topic6 -5.5428
363 sample_collected 188.884269 1119.032137 Topic6 -5.5864
182 significantly_higher 191.664251 1278.639230 Topic6 -5.5718
22 infiltration_rate 197.247244 1550.962068 Topic6 -5.5431
874 significant_difference 187.586777 1438.375566 Topic6 -5.5933
loglift
288 30.0000
92 29.0000
1574 28.0000
61 27.0000
388 26.0000
... ...
497 0.1222
363 0.1688
182 0.0501
22 -0.1143
874 -0.0891
[425 rows x 6 columns], token_table= Topic Freq Term
term
726 1 0.112533 aboveground_biomass
726 2 0.231956 aboveground_biomass
726 3 0.172244 aboveground_biomass
726 4 0.166503 aboveground_biomass
726 5 0.246884 aboveground_biomass
... ... ... ...
894 2 0.334950 field_capacity
894 3 0.097796 field_capacity
894 4 0.251824 field_capacity
894 5 0.139359 field_capacity
894 6 0.046453 field_capacity
[1638 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[6, 5, 4, 3, 1, 2])
out.topic_coordinates
# figure similar to Padarian and LDAvis
fig = plt.figure(figsize=(12, 6))
# topic on pca axes
df = out.topic_coordinates
ax = plt.subplot2grid((num_topics//2, 4), (0, 0), rowspan=num_topics//2, colspan=2)
ax.spines['right'].set_position('zero')
ax.spines['top'].set_position('zero')
ax.spines['left'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['right'].set_color('lightgrey')
ax.spines['top'].set_color('lightgrey')
ax.set_xticks([])
ax.set_yticks([])
scatter = ax.scatter(df['x'], df['y'], s=df['Freq']*100, alpha=0.5)
for i in range(df.shape[0]):
ax.text(df.loc[i, 'x'], df.loc[i, 'y'], str(df.loc[i, 'topics']), ha='center', va='center')
# produce a legend with a cross section of sizes from the scatter
handles, labels = scatter.legend_elements(prop="sizes", num=1, alpha=0.3, fmt='{x:.0f}%', func=lambda x: x/100)
legend2 = ax.legend(handles, labels, title="Marginal topic frequency")
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.axhline(0, color='lightgrey')
ax.axvline(0, color='lightgrey')
a = 0.05
ax.set_xlim([-a, a])
ax.set_ylim([-a, a])
# word in topic
df = out.topic_info
topn = 6
xlim = [0, None]
for i in range(num_topics):
category = 'Topic' + str(i+1)
tdf = df[df['Category'].eq(category)].sort_values('Freq', ascending=False).reset_index(drop=False)
ax = plt.subplot2grid((num_topics//2, 4), (i%3, 2 + i//3))
ax.set_title('Topic {:d} (C$_V$={:.2f})'.format(i+1, top_topics[i][1]))
ax.barh(np.arange(topn), tdf['Total'].values[:topn])
ax.barh(np.arange(topn), tdf['Freq'].values[:topn])
terms = tdf['Term'].str.replace('_', ' ').values[:topn].tolist()
ax.set_yticks(np.arange(len(terms)))
ax.set_yticklabels(terms)
ax.set_xlim(xlim)
ax.invert_yaxis()
ax.set_xlabel('Term frequency')
fig.tight_layout()
fig.savefig(outputdir + 'topics-meta.jpg', dpi=300)
# try multiple numbers of topics
avgcoh = []
for num_topics in tqdm([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40]):
print('modelling {:d} topics'.format(num_topics))
model = LdaMulticore(
workers=4,
corpus=corpus,
id2word=id2word,
chunksize=1000,
#alpha='auto',
eta='auto',
iterations=100,
num_topics=num_topics,
passes=10,
eval_every=None # don't evaluate model perplexity, takes too much time.
)
# top_topics = model.top_topics(corpus=corpus, coherence='u_mass', topn=5)
top_topics = model.top_topics(texts=docs, coherence='c_v', dictionary=dictionary, topn=5)
coherences = [t[1] for t in top_topics]
avgcoh.append([num_topics,
np.mean(coherences),
np.std(coherences)/np.sqrt(len(coherences))])
# figure
avgcoh = np.array(avgcoh)
fig, ax = plt.subplots()
ax.errorbar(avgcoh[:, 0], avgcoh[:, 1],# yerr=avgcoh[:, 2],
marker='.', linestyle='-')
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence')
fig.savefig(outputdir + 'coherence-meta.jpg', dpi=300)
Detecting relationships¶
By looking at sentences that contain both a 'driver' and a 'property', we try to find the relationship between the two.
Replacing abbreviations and detecting coreference¶
Coreference resolution consists in replacing a pronoun with the entity it refers to. For the cases observed in the abstracts, we did not find it useful.
# replace abbreviation
def replaceAbbreviation(text):
    if pd.isna(text) is False:
        abrvs = [a for a in re.findall(r'(\([A-Z]*\))', text) if len(a) > 3]
        for abrv in abrvs:
            # the meaning is assumed to be the words immediately preceding the parentheses
            meaning = ' '.join(re.split(' |\\n', text.split(abrv)[0])[-(len(abrv)-1):])
            text = text.replace(abrv, '')
            text = text.replace(abrv[1:-1], meaning)
    return text
replaceAbbreviation('the soil organic carbon (SOC) is generally well used. SOC is dependent on ')
dfpub['abstract'] = dfpub['abstract'].apply(replaceAbbreviation)
dfpub['abstract-web'] = dfpub['abstract-web'].apply(replaceAbbreviation)
# replace 'et al.' by 'et al' (without the dot) so the dot is not treated as a sentence boundary
def replaceEtAl(text):
    if pd.isna(text) is False:
        text = text.replace('et al.', 'et al')
    return text
dfpub['abstract'] = dfpub['abstract'].apply(replaceEtAl)
dfpub['abstract-web'] = dfpub['abstract-web'].apply(replaceEtAl)
Example¶
# example for paper
text = "In the short term, tillage operations significantly increased K"# for the entire range of pressure head applied."
sent = nlp(text)
displacy.render(sent, style='dep', options={'compact': True, 'distance': 90})
# build dependency graph and show shortest dependency path
fig, ax = plt.subplots()
node_size = 1500
# find shortest dependency path (SDP)
edges = []
for token in sent:
for child in token.children:
edges.append((token.i, child.i))
G = nx.Graph(edges)
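# token indices 5 and 9 correspond to 'tillage' and 'K' in the example sentence above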
sdp = nx.shortest_path(G,
source=5,
target=9)
# draw network
pos = nx.spring_layout(G)
nx.draw_networkx_edges(G, pos, ax=ax, alpha=0.5)
nx.draw_networkx_nodes(G, pos, alpha=1, ax=ax, node_color='orange', node_size=node_size)
nx.draw_networkx_labels(G, pos, labels=dict(zip(np.arange(len(sent)), [a for a in sent])),
font_family="sans-serif", ax=ax, font_size=12)
# draw sdp
path_edges = list(zip(sdp, sdp[1:]))
nx.draw_networkx_nodes(G, pos, nodelist=sdp[:1], node_color='lightblue', node_size=node_size)
nx.draw_networkx_nodes(G, pos, nodelist=sdp[-1:], node_color='lime', node_size=node_size)
nx.draw_networkx_edges(G, pos, edgelist=path_edges, edge_color='r', width=5)
plt.axis("off")
ax.set_xlim([-1.2, 1.2])
ax.set_ylim([-1.2, 1.2])
fig.tight_layout()
fig.savefig(outputdir + 'sdp.jpg', dpi=300)
Identifying relevant sentences¶
Namely, sentences that contain a relationship between a 'driver' and a 'property'.
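A minimal demonstration of the matching idea on a toy sentence (a sketch; the toy matcher, keys and sentence are illustrative, the actual driver and variable lists are built in the next cell):
toyMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
toyMatcher.add('coverCrop', [nlp('cover crop')])
toyMatcher.add('hydraulicConductivity', [nlp('hydraulic conductivity')])
toyDoc = nlp('the cover crops increased the hydraulic conductivity of the soil.')
for match_id, start, end in toyMatcher(toyDoc):
    print(nlp.vocab.strings[match_id], '->', toyDoc[start:end].text)
# expected to print both matches: matching on lemmas catches the plural 'crops'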
# identify drivers and practices (could make this more complex by using the mural's keywords)
drivers = ['tillage', 'cover crop', 'catch crop', 'residue', 'fertilizer', 'manure', 'liming',
'compost', 'traffic', 'compaction', 'biochar', 'irrigation', 'intercropping', 'agroforestry']
variables = [
'hydraulic conductivity', 'water retention', 'available water', 'runoff',
'infiltration', 'bulk density', 'macroporosity', 'penetration resistance',
'soil strength', 'aggregate stability', 'aggregation', 'yield', 'organic matter',
'organic carbon', 'microbial activity', 'microbial biomass', 'faunal activity',
'faunal biomass', 'earthworm biomass', 'earthworm activity', 'root biomass',
'root depth', 'root growth',
# the terms below were added from the abstracts (so not part of the original search query)
'rainwater penetration', 'K', 'Ks', 'K(h)', 'K0', 'infiltration rate'
]
# building patterns
driverMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
for driver in drivers:
driverMatcher.add(camelCase(driver), [nlp(driver)])
variableMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
for variable in variables:
variableMatcher.add(camelCase(variable), [nlp(variable)])
patterns = [
[
{'POS': 'AUX', 'OP': '+'},
{'POS': 'ADV', 'OP': '?'},
{'POS': 'ADJ', 'OP': '*'}],
[
{'POS': 'VERB', 'OP': '+'},
{'POS': 'ADV', 'OP': '?'},
{'POS': 'ADJ', 'OP': '?'},
{'POS': 'NOUN', 'OP': '?'}],
]
verbMatcher = Matcher(nlp.vocab)
verbMatcher.add('be have', patterns)
# look for sentence with the two keywords
dfmatch = pd.DataFrame(columns=['pubid', 'driver', 'variable', 'sentence',
'driverChunk', 'driverVerb', 'variableChunk',
'variableVerb', 'negation', 'sdp', 'relationship'])
for i, abstract in tqdm(enumerate(dfpub['abstract-web']), total=dfpub.shape[0], leave=True):
if pd.isna(abstract) is False:
doc = nlp(abstract)
# looking for sentence containing driver and variable
for sent in doc.sents:
driverMatches = []
variableMatches = []
# looking inside each noun chunk for keywords
for chunk in sent.noun_chunks:
matches = driverMatcher(chunk)
if len(matches) > 0:
driverMatches.append(
(nlp.vocab.strings[matches[0][0]], chunk))
matches = variableMatcher(chunk)
if len(matches) > 0:
variableMatches.append(
(nlp.vocab.strings[matches[0][0]], chunk))
# check if negation
negs = [token for token in sent if token.dep_ == 'neg']
if len(driverMatches) > 0 and len(variableMatches) > 0:
# building relationship between drivers and variables found
for driverMatch in driverMatches:
for variableMatch in variableMatches:
# attempting to get the verb linking the two chunks
# by walking up the dependency heads (up to four levels)
if driverMatch[1].root.head.pos_ == 'VERB':
verb1 = driverMatch[1].root.head.text
elif driverMatch[1].root.head.head.pos_ == 'VERB':
verb1 = driverMatch[1].root.head.head.text
elif driverMatch[1].root.head.head.head.pos_ == 'VERB':
verb1 = driverMatch[1].root.head.head.head.text
elif driverMatch[1].root.head.head.head.head.pos_ == 'VERB':
verb1 = driverMatch[1].root.head.head.head.head.text
else:
verb1 = None
if variableMatch[1].root.head.pos_ == 'VERB':
verb2 = variableMatch[1].root.head.text
elif variableMatch[1].root.head.head.pos_ == 'VERB':
verb2 = variableMatch[1].root.head.head.text
elif variableMatch[1].root.head.head.head.pos_ == 'VERB':
verb2 = variableMatch[1].root.head.head.head.text
elif variableMatch[1].root.head.head.head.head.pos_ == 'VERB':
verb2 = variableMatch[1].root.head.head.head.head.text
else:
verb2 = None
# find shortest dependency path (SDP)
edges = []
for token in sent:
for child in token.children:
edges.append((token.i, child.i))
graph = nx.Graph(edges)
sdp = nx.shortest_path(graph,
source=driverMatch[1].root.i,
target=variableMatch[1].root.i)
i2t = dict(zip([token.i for token in sent],
[token for token in sent]))
#sdpVerb = [i2t[a].lower_ for a in sdp if i2t[a].pos_ == 'VERB']
# replace driver, variable and verb by more informative chunks
sdpPath = ', '.join([i2t[a].lower_ for a in sdp])
sdpPath = sdpPath.replace(driverMatch[1].root.text,
driverMatch[1].text.lower())
sdpPath = sdpPath.replace(variableMatch[1].root.text,
variableMatch[1].text.lower())
verbMatches = verbMatcher(sent)
verbSpans = [sent[start:end] for _, start, end in verbMatches]
sdpVerb = ''
sdpVerbLong = ''
for a in sdp:
token = i2t[a]
if (token.pos_ == 'AUX') | (token.pos_ == 'VERB'):
sdpVerb = token.text
sdpVerbLong = token.text
for verbSpan in verbSpans:
if ((token in verbSpan)
& (len(sdpVerbLong) < len(verbSpan.text))):
sdpVerbLong = verbSpan.text
sdpPath = sdpPath.replace(sdpVerb, sdpVerbLong)
dic = {
'pubid': i,
'driver': driverMatch[0],
'variable': variableMatch[0],
'sentence': sent.text.strip().replace('\n',' '),
'driverChunk': driverMatch[1].text.strip().replace('\n', ' '),
'driverVerb': verb1,
'variableChunk': variableMatch[1].text.strip().replace('\n', ' '),
'variableVerb': verb2,
'sdp': sdpPath,
#'sdp_old': ', '.join([i2t[a].lower_ for a in sdp]),
'sdpVerb': sdpVerbLong,
'negation': True if len(negs) % 2 != 0 else False,
}
dfmatch = dfmatch.append(dic, ignore_index=True)
# drop duplicates
print(dfmatch.shape, '-> ', end='')
dfmatch = dfmatch.drop_duplicates().reset_index(drop=True)
print(dfmatch.shape, 'after duplicates drop')
print('{:d} relationship detected (among {:d} documents)'.format(
dfmatch.shape[0], dfpub.shape[0]))
dfmatch.sort_values('sentence').to_excel(datadir + 'dfmatch-web.xlsx', index=False)
Assessing the correlation of relationships¶
Possible sentiments:
- +1: the two entities are positively correlated; when the driver increases, the property increases (e.g. biochar increases hydraulic conductivity)
- -1: the two entities are negatively correlated; when the driver increases, the property decreases (e.g. reduced tillage increases soil hydraulic conductivity)
- 0: there is no effect, neither positive nor negative (e.g. the application of biochar had no significant effect on soil hydraulic conductivity)
- 2: the relationship has been studied but no conclusion is given (e.g. the effect of biochar on soil hydraulic conductivity was investigated)
- NA: not relevant, unclassified
# load dataframe with manually assessed sentence
dfmatch = pd.read_excel(datadir + 'dfmatch-web-labels.xlsx')
# drivers and predictors
fig, ax = plt.subplots()
dfmatch['driver'].value_counts().plot(kind='bar', ax=ax)
ax.set_ylabel('Count')
ax.set_xlabel('Drivers');
# show number of sentences identified
dft = dfmatch[['driver', 'variable']].copy()
dft['one'] = 1
dft = dft.pivot_table(values='one', columns='driver', index='variable', aggfunc=np.sum, fill_value=0)
dft = dft.replace(0, np.nan)
fig, ax = plt.subplots()
cax = ax.imshow(dft.values.T, vmax=50)
ax.set_yticks(np.arange(dft.shape[1]))
ax.set_yticklabels(dft.columns)
ax.set_ylabel('Drivers')
ax.set_xticks(np.arange(dft.shape[0]))
ax.set_xticklabels(dft.index, rotation=90)
ax.set_xlabel('Variables')
fig.colorbar(cax, ax=ax, label='Count')
fig.tight_layout()
fig.savefig(outputdir + 'rel-identified-meta.jpg', dpi=300)
# matcher for sentiments
posVerb = ['enhance', 'increase', 'improve', 'mitigate',
'have significant increase', 'have significant effect',
'have greater effect', 'be higher', 'can make a valuable contribution']
negVerb = ['decrease', 'reduce', 'prevent', 'be lower',
'have significant decrease']
neuVerb = ['have no effect', 'have no significant effect', 'have minimal effect',
'be relatively small', 'whatever', 'affect']
posMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
for a in posVerb:
posMatcher.add(camelCase(a), [nlp(a)])
negMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
for a in negVerb:
negMatcher.add(camelCase(a), [nlp(a)])
neuMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
for a in neuVerb:
neuMatcher.add(camelCase(a), [nlp(a)])
studyWords = ['study', 'analyzed', 'analysed', 'measured', 'hypothesize', 'determine',
'assessed', 'reported', 'be done', 'be conducted', 'quantify', 'compare']
studyMatcher = PhraseMatcher(nlp.vocab, attr='LEMMA')
studyMatcher.add('study', [nlp(a) for a in studyWords])
studyWords2 = ['objective', 'purpose', 'study']
studyMatcher2 = PhraseMatcher(nlp.vocab, attr='LEMMA')
studyMatcher2.add('study', [nlp(a) for a in studyWords2])
# was higher (be + modifiers), had a greater effect
# had higher hydraulic conductivity, lower, decreasing of, improvement in
# assessing sentiment on shortest dependency path (sdp)
dfmatch['sdpRel'] = np.nan
modifiers = ['less', 'conservation', 'no-', 'no', '(non-inversion)', 'non-inversion',
'reduced', 'minimum', 'decreasing of', 'decreasing in']
for i, sdp in tqdm(enumerate(dfmatch['sdp']), total=dfmatch.shape[0]):
# identify modifier
a = 1
driverChunk = dfmatch.loc[i, 'driverChunk']
if pd.Series(driverChunk.split(' ')).isin(modifiers).sum() > 0:
a = -1
b = 1
variableChunk = dfmatch.loc[i, 'variableChunk']
if pd.Series(variableChunk.split(' ')).isin(modifiers).sum() > 0:
b = -1
c = 1
if dfmatch.loc[i, 'negation'] == True:
c = -1
if len(posMatcher(nlp(sdp))) > 0:
dfmatch.loc[i, 'sdpRel'] = 1 * a * b * c
elif len(negMatcher(nlp(sdp))) > 0:
dfmatch.loc[i, 'sdpRel'] = -1.0 * a * b * c
elif len(neuMatcher(nlp(sdp))) > 0:
dfmatch.loc[i, 'sdpRel'] = 0.0
elif len(studyMatcher(nlp(sdp))) > 0:
dfmatch.loc[i, 'sdpRel'] = 2.0
else:
# the check on the full sentence below adds quite a few extra relationships
if len(studyMatcher(nlp(dfmatch.loc[i, 'sentence']))) > 0:
dfmatch.loc[i, 'sdpRel'] = 2.0
print('{:d}/{:d} relationship found'.format(
dfmatch['sdpRel'].notnull().sum(), dfmatch.shape[0]))
dfmatch.sort_values('sentence').to_excel(datadir + 'dfmatch-nlp.xlsx', index=False)
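The sign convention used above can be summarised with a small worked example (a sketch with hypothetical flag values mirroring the loop above):
# base sentiment from the verb matchers (+1 positive, -1 negative), flipped once per negative element
base = 1   # e.g. posMatcher fired on 'increased' in the dependency path
a = -1     # the driver chunk contains a modifier such as 'reduced' or 'minimum'
b = 1      # the variable chunk has no modifier
c = 1      # no negation detected in the sentence
print(base * a * b * c)  # -1: 'reduced tillage increased K' is recorded as a negative tillage effect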
# figure (double histogram with well identified and not well identified)
fig, axs = plt.subplots(1, 2, sharey=True, figsize=(8, 3))
cats = [-1, 0, 1, 2]
colors = {1: 'tab:orange', 0: 'lightgrey', -1: 'tab:blue', 2: 'tab:green'}
xlabs = ['negative', 'neutral', 'positive', 'study']
isub = dfmatch['relTrue'].isin(cats)
# recall
ax = axs[0]
ax.set_title('(a) Recall\n(how many relevant found from total)')
#ie = dfmatch['relationship'].isna() & dfmatch['relTrue'].isna()
#cax = ax.bar(-2, ie.sum())
#ax.bar(-2, dfmatch['relTrue'].isna().sum(), bottom=ie.sum(),
# alpha=0.5, color=cax.patches[0].get_facecolor())
scores = []
for cat in cats:
icommon = dfmatch[isub]['sdpRel'].eq(cat) & dfmatch[isub]['relTrue'].eq(cat)
irelevant = dfmatch[isub]['relTrue'].eq(cat)
scores.append(icommon.sum()/irelevant.sum())
cax = ax.bar(cat, icommon.sum(), color=colors[cat])
ax.bar(cat, irelevant.sum(),
alpha=0.5, color=cax.patches[0].get_facecolor())
ax.bar([], [], color='k', alpha=0.5, label='not found')
ax.bar([], [], color='k', alpha=1, label='found')
leg = ax.legend()
for lh, a in zip(leg.legendHandles, [0.5, 1]):
lh.set_alpha(a)
lh.set_color('green')
ax.set_xticks(cats)
ax.set_xticklabels([xlab + '\n{:.2f}'.format(a) for xlab, a in zip(xlabs, scores)])
ax.set_xlabel('Status')
ax.set_ylabel('Number of relationships')
ax = axs[1]
ax.set_title('(b) Precision\n(how many relevant among selected)')
#ie = dfmatch['relationship'].isna() & dfmatch['relTrue'].isna()
#cax = ax.bar(-2, ie.sum())
#ax.bar(-2, dfmatch['relTrue'].isna().sum(), bottom=ie.sum(),
# alpha=0.5, color=cax.patches[0].get_facecolor())
scores = []
for cat in cats:
icommon = dfmatch[isub]['sdpRel'].eq(cat) & dfmatch[isub]['relTrue'].eq(cat)
iselected = dfmatch[isub]['sdpRel'].eq(cat)
scores.append(icommon.sum()/iselected.sum())
cax = ax.bar(cat, icommon.sum(), color=colors[cat])
ax.bar(cat, iselected.sum(),
alpha=0.5, color=cax.patches[0].get_facecolor())
ax.bar([], [], color='k', alpha=0.5, label='not relevant')
ax.bar([], [], color='k', alpha=1, label='relevant')
leg = ax.legend()
for lh, a in zip(leg.legendHandles, [0.5, 1]):
lh.set_alpha(a)
lh.set_color('green')
ax.set_xticks(cats)
ax.set_xticklabels([xlab + '\n{:.2f}'.format(a) for xlab, a in zip(xlabs, scores)])
ax.set_xlabel('Status')
fig.tight_layout()
fig.savefig(outputdir + 'rel-score-bar-meta.jpg', dpi=500)
# figure with matrix relationship between drivers and properties
ie = dfmatch['relTrue'].notnull() # all not NaN relationships
df = dfmatch[ie].copy().replace(-2, np.nan)
drivers = df['driver'].unique()
variables = df['variable'].unique()
colors = {1: 'tab:orange', 0: 'lightgrey', -1: 'tab:blue', 2: 'tab:green'}
r = 0.3 # the radius could be made to vary with the number of relationships found
# create legend (for sizes)
fig, axs = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
ax = axs[0]
ax.set_title('(a) Manually labelled')
for i, driver in enumerate(drivers):
for j, variable in enumerate(variables):
ie = df['driver'].eq(driver) & df['variable'].eq(variable)
s = df[ie]['relTrue'].value_counts()
a = s.sum()
sdict = (s/a).to_dict() # in proportion
if a < 5:
r = 0.2
elif a < 10:
r = 0.3
else:
r = 0.5
theta1 = 0
for key in [1, 0, -1, 2]:
if key in sdict:
val = sdict[key]
theta2 = theta1 + val*360
ax.add_patch(mpatches.Wedge((j, i), r, theta1, theta2, ec='none',
color=colors[key]))
theta1 = theta2
ax.set_yticks(np.arange(len(drivers)))
ax.set_yticklabels(drivers)
ax.set_ylabel('Drivers')
ax.set_xticks(np.arange(len(variables)))
ax.set_xticklabels(variables, rotation=90)
ax.set_xlabel('Variables')
ax.set_aspect('equal')
ax = axs[1]
ax.set_title('(b) NLP recovered')
for i, driver in enumerate(drivers):
for j, variable in enumerate(variables):
ie = df['driver'].eq(driver) & df['variable'].eq(variable)
s = df[ie]['sdpRel'].value_counts()
a = s.sum()
sdict = (s/a).to_dict() # in proportion
if a < 5:
r = 0.2
elif a < 10:
r = 0.3
else:
r = 0.5
theta1 = 0
for key in [1, 0, -1, 2]:
if key in sdict:
val = sdict[key]
theta2 = theta1 + val*360
ax.add_patch(mpatches.Wedge((j, i), r, theta1, theta2, ec='none',
color=colors[key]))
theta1 = theta2
ax.set_xticks(np.arange(len(variables)))
ax.set_xticklabels(variables, rotation=90)
ax.set_xlabel('Variables')
# add legend
msizes = [0.2, 0.3, 0.5]
labels = ['1-4', '5-9', '>10']
markers = []
for i, size in enumerate(msizes):
markers.append(plt.scatter([],[], s=size**2*1000, label=labels[i], color='k', alpha=0.5))
ax.add_artist(plt.legend(handles=markers, bbox_to_anchor=(1, 1), title='Nb statements', fontsize=12))
markers = []
markers.append(ax.add_patch(mpatches.Wedge((0,0), 0, 0, 360, color=colors[1], label='positive')))
markers.append(ax.add_patch(mpatches.Wedge((0,0), 0, 0, 360, color=colors[0], label='neutral')))
markers.append(ax.add_patch(mpatches.Wedge((0,0), 0, 0, 360, color=colors[-1], label='negative')))
markers.append(ax.add_patch(mpatches.Wedge((0,0), 0, 0, 360, color=colors[2], label='study')))
ax.legend(handles=markers, bbox_to_anchor=(1, 0.1), title='Relationship', fontsize=12)
ax.set_xlim([-0.5, len(variables)-0.5])
ax.set_ylim([-0.5, len(drivers)-0.5])
ax.set_aspect('equal')
fig.tight_layout()
fig.savefig(outputdir + 'rel-bubbles-meta.jpg', dpi=500)
# scoring
# task 1: identification of relationships
# we would need to manually extract all relationship from abstract for this
# task 2: sentiment of relationships (match (+1/-1) or not (0))
#col = 'relationship'
col = 'sdpRel'
n = dfmatch.shape[0]
tp = (dfmatch[col].notnull() & dfmatch['relTrue'].notnull()).sum()
tn = (dfmatch[col].isna() & dfmatch['relTrue'].isna()).sum()
fp = (dfmatch[col].notnull() & dfmatch['relTrue'].isna()).sum()
fn = (dfmatch[col].isna() & dfmatch['relTrue'].notnull()).sum()
recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1 = 2 * (recall * precision) / (recall + precision)
print('on selection of relevant relationships:')
print('recall: {:.2f}'.format(recall))
print('precision: {:.2f}'.format(precision))
print('f1 score: {:.2f}'.format(f1))
# task 3: correct assignment
ie = dfmatch['relTrue'].notnull() & dfmatch['relTrue'].ne(-2)
match = (dfmatch[ie][col] == dfmatch[ie]['relTrue']).sum()
print('correctly assigned: {:d}/{:d} ({:.0f}%)'.format(
match, np.sum(ie), match/np.sum(ie)*100))
on selection of relevant relationships:
recall: 0.74
precision: 0.04
f1 score: 0.08
correctly assigned: 83/125 (66%)
# show mistakes
ie1 = dfmatch['relTrue'].isna() & dfmatch['sdpRel'].isna()
ie2 = dfmatch['relTrue'] != dfmatch['sdpRel']
ie2[ie1] = False # nan != nan is always True, so rows where both are NaN are not counted as mistakes
dfmatch[ie2][['driver', 'variable', 'sentence', 'driverVerb', 'variableVerb', 'negation', 'sdp', 'sdpRel', 'relTrue']]
- Röder, M., Both, A., & Hinneburg, A. (2015). Exploring the Space of Topic Coherence Measures. In Proceedings of the Eighth ACM International Conference on Web Search and Data Mining (WSDM '15). https://doi.org/10.1145/2684822.2685324