May 2018 Alternative Evaluation for YAP¶
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from glob import glob
sns.set_context('talk')
sns.set_style("white")
sns.set_palette('Set2', 10)
%matplotlib inline
In [43]:
dev_dir = r'MAY18\yap_dev'
test_dir = r'MAY18\yap_test'
dev_paths = glob(os.path.join(dev_dir, '*inf.conll'))
dev_gold_path = os.path.join(dev_dir, 'dev.hebtb.lgold.conll')
test_paths = glob(os.path.join(test_dir, '*inf.conll'))
test_gold_path = os.path.join(test_dir, 'test.hebtb.lgold.conll')
In [15]:
def make_conll_df(path):
    # CoNLL files are tab-delimited with no quoting
    # (quoting=3 is csv.QUOTE_NONE)
    df = (pd.read_csv(path, sep='\t', header=None, quoting=3,
                      names=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'])
            # add sentence labels: a new sentence starts whenever ID resets to 1
            .assign(sent=lambda x: (x.ID == 1).cumsum())
            # replace bad root dependency tags
            .replace({'DEPREL': {'prd': 'ROOT'}})
          )
    # attach each token's head FORM and UPOS by joining on (sent, HEAD)
    df = df.merge(df[['ID', 'FORM', 'sent', 'UPOS']]
                    .rename(index=str, columns={'FORM': 'head_form', 'UPOS': 'head_upos'})
                    .set_index(['sent', 'ID']),
                  left_on=['sent', 'HEAD'], right_index=True, how='left')
    return df
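To make the head merge concrete, here is a toy run on a two-token CoNLL fragment (the transliterated forms are made up; pd.read_csv also accepts a file-like object, so the snippet can be fed in directly):

import io

toy_conll = (
    "1\tha\t_\tDEF\tDEF\t_\t2\tdef\t_\t_\n"
    "2\tbait\t_\tNN\tNN\t_\t0\tROOT\t_\t_\n"
)
toy_df = make_conll_df(io.StringIO(toy_conll))
# token 1 now carries head_form='bait' and head_upos='NN';
# token 2 points at the root (HEAD=0), so its head columns stay NaN
toy_df[['ID', 'FORM', 'HEAD', 'DEPREL', 'head_form', 'head_upos']]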
In [44]:
dev_gold = make_conll_df(dev_gold_path)
dev_dfs = list(map(make_conll_df, dev_paths))
test_gold = make_conll_df(test_gold_path)
test_dfs = list(map(make_conll_df, test_paths))
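A quick sanity check on what was loaded (a minimal sketch; the printed counts depend on the actual treebank files):

print(len(dev_paths), 'dev parse files,', len(test_paths), 'test parse files')
print(dev_gold['sent'].nunique(), 'gold dev sentences,', test_gold['sent'].nunique(), 'gold test sentences')
dev_gold.head()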
Evaluate¶
Handwavy pseudo-algorithm:

1. points = 0
2. For each sentence:
    1. g <- set(gold[FORM, UPOS, DEPREL, head_form])
    2. t <- set(test[FORM, UPOS, DEPREL, head_form])
    3. points += len(g.intersection(t)) / avg(len(g), len(t))

In words: for each test sentence, count the correct tokens via set intersection with the gold sentence, then normalize by sentence length. The current normalization uses the average of the gold and test sentence lengths, since a test sentence and its gold counterpart may not have the same number of tokens. The plain-Python sketch below spells out the same idea.
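A minimal sketch of this scoring idea with plain Python sets (the sentence_score helper and the toy tuples are made up for illustration, not the notebook's actual implementation):

# each token is a (FORM, UPOS, DEPREL, head_form) tuple
def sentence_score(gold_tokens, test_tokens):
    g = set(gold_tokens)
    t = set(test_tokens)
    norm = (len(gold_tokens) + len(test_tokens)) / 2  # average sentence length
    return len(g & t) / norm

gold_sent = [('ha', 'DEF', 'def', 'bait'), ('bait', 'NN', 'subj', 'gadol'), ('gadol', 'JJ', 'ROOT', '___')]
test_sent = [('ha', 'DEF', 'def', 'bait'), ('bait', 'NN', 'subj', 'gadol'), ('gadol', 'VB', 'ROOT', '___')]
sentence_score(gold_sent, test_sent)  # 2 of 3 tuples match -> ~0.67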
In [26]:
EVAL_COLS = ['FORM', 'UPOS', 'DEPREL', 'head_form']

def score(t, g, columns=EVAL_COLS):
    # get the sentence number as a scalar
    sent = t['sent'].iloc[0]
    # pick out the matching gold sentence
    g = g[g.sent == sent]
    # normalize by the average of the gold and test sentence lengths
    norm = (t.shape[0] + g.shape[0]) / 2
    # use pandas index set logic to get the intersection
    g = g.set_index(columns)
    t = t.set_index(columns)
    return len(g.index.intersection(t.index)) / norm
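As a quick illustration of the index set logic (toy tuples, not from the treebank), intersecting two MultiIndexes counts only rows where every column matches:

a = pd.MultiIndex.from_tuples([('bait', 'NN', 'subj', 'gadol'), ('gadol', 'JJ', 'ROOT', '___')])
b = pd.MultiIndex.from_tuples([('bait', 'NN', 'subj', 'gadol'), ('gadol', 'VB', 'ROOT', '___')])
len(a.intersection(b))  # 1 -- only the fully identical tuple survives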
In [45]:
dev_scores = [df.groupby('sent').apply(score, g=dev_gold) for df in dev_dfs]
In [46]:
test_scores = [df.groupby('sent').apply(score, g=test_gold) for df in test_dfs]
Remove punctuation chars and re-evaluate¶
In [29]:
def depunct(df):
    # root tokens have no head_form/head_upos after the merge;
    # fill them with a placeholder so they can still be matched between gold and test
    new_df = df.fillna({'head_upos': '___', 'head_form': '___'}).copy()
    # drop punctuation tokens (Hebrew treebank punctuation tags start with 'yy')
    new_df = new_df[~new_df.UPOS.str.startswith('yy')]
    # neutralize punctuation heads so they don't count against the head_form match
    new_df.loc[new_df['head_upos'].str.startswith('yy', na=False), 'head_form'] = '___'
    return new_df
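A toy check of what depunct does (made-up rows; only the columns it touches are included):

toy = pd.DataFrame({'FORM': ['bait', '.'],
                    'UPOS': ['NN', 'yyDOT'],
                    'head_form': ['.', np.nan],
                    'head_upos': ['yyDOT', np.nan]})
depunct(toy)
# the yyDOT row is dropped, and the remaining token's punctuation head_form becomes '___'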
In [104]:
dev_nop = list(map(depunct, dev_dfs))
dev_gold_nop = depunct(dev_gold)
test_nop = list(map(depunct, test_dfs))
test_gold_nop = depunct(test_gold)
dev_nop_scores = [df.groupby('sent').apply(score, g=dev_gold_nop) for df in dev_nop]
test_nop_scores = [df.groupby('sent').apply(score, g=test_gold_nop) for df in test_nop]
In [47]:
{path: s.mean() for path, s in zip(dev_paths, dev_scores)}
Out[47]:
In [48]:
{path: s.mean() for path, s in zip(test_paths, test_scores)}
Out[48]:
Without Punctuation¶
In [51]:
{path: s.mean() for path, s in zip(dev_paths, dev_nop_scores)}
Out[51]:
In [52]:
{path: s.mean() for path, s in zip(test_paths, test_nop_scores)}
Out[52]:
Length vs. Score¶
In [92]:
dev_gold_len = dev_gold.groupby('sent').size()
f, axes = plt.subplots(len(dev_paths), 1, figsize=(15, 40), sharex=True)
for i, ax in enumerate(f.axes):
    sns.regplot(x=dev_gold_len, y=dev_scores[i], ax=ax)
    ax.set_xlabel('Gold Sentence Length')
    ax.set_ylabel('Score')
    ax.set_title(os.path.basename(dev_paths[i]))
f.subplots_adjust(hspace=0.3)
Bivariate KDE Plot¶
In [86]:
f, axes = plt.subplots(len(dev_paths), 1, figsize=(15, 40), sharex=True)
for i, ax in enumerate(f.axes):
    sns.kdeplot(dev_gold_len[dev_gold_len < 60], dev_scores[i][dev_gold_len < 60], shade=True, ax=ax)
    ax.set_xlabel('Gold Sentence Length')
    ax.set_ylabel('Score')
    ax.set_title(os.path.basename(dev_paths[i]))
f.subplots_adjust(hspace=0.3)
In [93]:
test_gold_len = test_gold.groupby('sent').size()
f, axes = plt.subplots(len(test_paths), 1, figsize=(15, 40), sharex=True)
for i, ax in enumerate(f.axes):
    sns.regplot(x=test_gold_len, y=test_scores[i], ax=ax)
    ax.set_xlabel('Gold Sentence Length')
    ax.set_ylabel('Score')
    ax.set_title(os.path.basename(test_paths[i]))
f.subplots_adjust(hspace=0.3)