import pandas as pd
import numpy as np
import re
pd.set_option('display.max_colwidth', None)
import os, pickle
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# CUSTOM BEATLES-THEMED COLOR PALETTE
color_codes = ["#D4B504", # (Mean Mr.) Mustard
               "#FF0065", # Strawberry (Fields Forever)
               "#0089FF", # Blue (Jay Way)
               "#FFD700", # Gold(en Slumbers)
               "#C0C0C0", # (Maxwell's) Silver (Hammer)
               "#A36302",] # Penny (Lane)
color_names = ["mr. mustard", "strawberry fields", "blue jay way",
               "golden slumbers", "silver hammer", "penny lane"]
palette = sns.color_palette(color_codes)
sns.set_palette(palette)
sns.palplot(palette)

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))


beatles_df = pd.read_csv('song_data/beatles_era_metadata.csv', sep=',')
beatles_df['reg_song'] = beatles_df['song'].apply(lambda x: re.sub("[\[].*?[\]]", "", x.lower().replace('-',' ')))
beatles_df['reg_song'] = beatles_df['reg_song'].apply(lambda x: re.sub(r"[^A-Za-z0-9 ]+", "", x))
display(beatles_df.head())

BEATLES = {'john','paul','george','ringo'}


writer_counts = beatles_df['songwriter'].value_counts()
beatles_df['songwriter'].apply(
    lambda x: x + ', ' + str(writer_counts[x])
    if type(x) == str
    else x).value_counts()\
.plot.pie(figsize=(12, 12), fontsize=14)
plt.ylabel("")
title = plt.title("Figure 1. Share of song lyrics attributed to each author.", fontsize=20)


beatles_df = beatles_df.loc[beatles_df['songwriter'].isin({b.capitalize() for b in BEATLES})].copy()
counts_df = beatles_df.groupby(['songwriter','year recorded']).size() \
                      .sort_values(ascending=False) \
                      .reset_index(name='count')
fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(data=counts_df.sort_values('year recorded',ascending=True),
             style='songwriter',hue='songwriter',x='year recorded',y='count',ax=ax)
sns.despine()
ax.set_ylabel("Number of songs",fontsize=16)
ax.set_xlabel("")
plt.setp(ax.get_legend().get_texts(), fontsize='15')
plt.setp(ax.get_legend().get_title(), fontsize='20')
_ = ax.set_title("Figure 2. Share of song lyrics attributed to each Beatle over time",fontsize=20)


save_path = 'song2lyrics.pkl'
song_title2lyrics = pickle.load(open(save_path,'rb'))
lyrics2song_title = dict(zip(song_title2lyrics.values(), song_title2lyrics.keys()))


song_title2dedup_lyrics = pickle.load(open('song2dedup_lyrics.pkl','rb'))


from utils import dedupe_lyrics

deduplicated_rocky = dedupe_lyrics(song_title2lyrics['rocky raccoon'], verbose=True)

now somewhere in the black mountain hills of dakota
there lived a young boy named rocky raccoon
and one day his woman ran off with another guy
hit young rocky in the eye rocky didnt like that
he said im gonna get that boy
so one day he walked into town
booked himself a room in the local saloon
rocky raccoon checked into his room
only to find gideons bible
rocky had come equipped with a gun
to shoot off the legs of his rival
his rival it seems had broken his dreams
by stealing the girl of his fancy
her name was magill and she called herself lil
but everyone knew her as nancy
now she and her man who called himself dan
were in the next room at the hoedown
rocky burst in and grinning a grin
he said danny boy this is a showdown
but daniel was hot he drew first and shot
and rocky collapsed in the corner ah
dda dda dda da da da
dda dda dda da da da
dda dda dda da dda dda dda dda
do do do do do do
ddo ddo ddo do do do
ddo ddo ddo do do do
ddo ddo ddo do do ddo ddo ddo ddo
do do do do do do
now the doctor came in stinking of gin
and proceeded to lie on the table
he said rocky you met your match
and rocky said doc its only a scratch
and ill be better ill be better doc as soon as i am able
and now rocky raccoon he fell back in his room
only to find gideons bible
gideon checked out and he left it no doubt
to help with good rockys revival ah
oh yeah yeah
ddo ddo ddo do do do
ddo ddo ddo do do do
ddo ddo ddo do do ddo ddo ddo ddo
do do do do do do
ddo ddo ddo do do do come on rocky boy
ddo ddo ddo do do do come on rocky boy
ddo ddo ddo do do ddo ddo ddo ddo
the story of rocky there


import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
bytes_data = pickle.load(open('pickled_spacy_docs/bytes_data.pkl','rb'))
doc_bin = DocBin().from_bytes(bytes_data)
docs = list(doc_bin.get_docs(nlp.vocab))
song_title2doc = dict(zip(song_title2dedup_lyrics.keys(), docs))


for tok in song_title2doc['here comes the sun']:
    print(tok.text, tok.lemma_, tok.pos_, tok.head.text, tok.dep_)

here here ADV comes advmod
comes come VERB comes ROOT
the the DET doo det
sun sun PROPN doo compound
doo doo PROPN doo compound
- - PUNCT doo punct
doo doo NOUN doo compound
- - PUNCT doo punct
doo doo NOUN doo compound
- - PUNCT doo punct
doo doo NOUN comes nsubj
here here ADV comes advmod
comes come VERB comes conj
the the DET sun det
sun sun NOUN comes nsubj


 SPACE sun dep
and and CCONJ comes cc
i I PRON say nsubj
say say VERB comes conj
its its PRON little poss
all all PRON right advmod
right right ADJ little amod


 SPACE right dep
little little ADJ darling amod
darling darle VERB been nsubj
its its PRON been nsubj
been be AUX say ccomp
a a DET darling det
long long ADJ winter amod
cold cold ADJ winter amod
lonely lonely ADJ winter amod
winter winter NOUN little compound


 SPACE winter dep
little little ADJ darling amod
darling darling NOUN been attr
it it PRON feels nsubj
feels feel VERB darling relcl
like like ADP feels prep
years year NOUN like pobj
since since SCONJ been mark
its its PRON been nsubj
been be AUX seems advcl
here here ADV been advmod


 SPACE here dep
little little ADJ darling amod
darling darle VERB been npadvmod
the the DET smiles det
smiles smile NOUN darling dobj
returning return VERB smiles acl
to to ADP returning prep
the the DET faces det
faces face NOUN to pobj


 SPACE faces dep
little little ADJ darling amod
darling darling NOUN to pobj
it it PRON seems nsubj
seems seem VERB say ccomp
like like ADP seems prep
years year NOUN like pobj
since since SCONJ been mark
its its PRON been nsubj
been be AUX comes advcl
here here ADV been advmod


 SPACE here dep
sun sun NOUN been attr
here here ADV sun advmod
it it PRON comes nsubj
comes come VERB seems ccomp


 SPACE comes dep
little little ADJ darling amod
darling darling NOUN darling ROOT
i I PRON feel nsubj
feel feel VERB feel ROOT
that that SCONJ melting mark
ice ice NOUN melting nsubj
is be AUX melting aux
slowly slowly ADV melting advmod
melting melt VERB feel ccomp


 SPACE melting dep
little little ADJ darling amod
darling darling NOUN melting dobj
it it PRON seems nsubj
seems seem VERB feel ccomp
like like ADP seems prep
years year NOUN like pobj
since since SCONJ been mark
its its PRON been nsubj
been be AUX seems advcl
clear clear ADJ been acomp


 SPACE clear dep
its its PRON right poss
all all ADV right advmod
right right NOUN right ROOT


total_num_tokens = len([tok for song_title in song_title2doc
                        if song_title in set(beatles_df['reg_song'].values)
                        for tok in song_title2doc[song_title]])
total_num_lemmas = len(set([tok.lemma_  for song_title in song_title2doc
                            if song_title in set(beatles_df['reg_song'].values)
                            for tok in song_title2doc[song_title]]))
print(total_num_tokens, total_num_lemmas,
      len(set(song_title2doc.keys()).intersection(set(beatles_df['reg_song'].values))))

23379 1819 164


# Create train, eval data splits

## Convert names into integers for classification data; 
beatle2int_label = {b: i for i, b in enumerate(BEATLES)}
print('Conversion scheme:', beatle2int_label)
int_label2beatle = {i: b for i, b in enumerate(BEATLES)}

song_title2author = dict(zip(beatles_df['reg_song'].values,
                             beatles_df['songwriter'].apply(
                                 lambda x: x.lower() if type(x) == str else x).values))
songs_without_lyrics = [song for song in beatles_df['reg_song'].values if song not in song_title2lyrics]

## Exclude songs with multiple writers;
combined_texts_with_labels = [(song_title2lyrics[song], song_title2author[song])
                              for song in beatles_df['reg_song'].values
                              if song in song_title2lyrics
                              and song_title2author[song] in {'john','paul','george'}]
print(f"Assembled {len(combined_texts_with_labels)} (lyrics, label) datapoints to use for train/eval data.\n")
print("Excluding the following songs without lyrics:", songs_without_lyrics)

RANDOM_SEED = 1969
np.random.seed(RANDOM_SEED)
train_n = int(len(combined_texts_with_labels)*0.8)
eval_n = len(combined_texts_with_labels) - train_n
print(f"Creating train/eval splits of size {train_n}, {eval_n}, respectively.")
train_ixs = set(np.random.choice(a=list(range(len(combined_texts_with_labels))), size=train_n, replace=False))
eval_ixs = set(range(len(combined_texts_with_labels))).difference(train_ixs)
assert train_ixs.intersection(eval_ixs) == set()

train_texts_with_labels = [combined_texts_with_labels[ix] for ix in train_ixs]
eval_texts_with_labels = [combined_texts_with_labels[ix] for ix in eval_ixs]
print(f"\nDistribution of train labels:", Counter([item[1] for item in train_texts_with_labels]))
print(f"Distribution of eval labels:", Counter([item[1] for item in eval_texts_with_labels]))

train_texts = [item[0] for item in train_texts_with_labels]
train_labels = [item[1] for item in train_texts_with_labels]

eval_texts = [item[0] for item in eval_texts_with_labels]
eval_labels = [item[1] for item in eval_texts_with_labels]

Conversion scheme: {'paul': 0, 'ringo': 1, 'george': 2, 'john': 3}
Assembled 162 (lyrics, label) datapoints to use for train/eval data.

Excluding the following songs without lyrics: []
Creating train/eval splits of size 129, 33, respectively.

Distribution of train labels: Counter({'john': 56, 'paul': 55, 'george': 18})
Distribution of eval labels: Counter({'john': 17, 'paul': 13, 'george': 3})


# Logistic regression w/ different combinations of features

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion
import warnings
warnings.filterwarnings("ignore")

from utils import *

ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,4), min_df=5)
tfidf_vectorizer = TfidfVectorizer(min_df=5)
combined_features = FeatureUnion([("ngram", ngram_vectorizer),
                                  ("tfidf", tfidf_vectorizer),
                                  ("ttr", TTRExtractor()),
                                  ("first.sg", FirstSgPronounExtractor()),
                                  ("first.pl", FirstPlPronounExtractor()),
                                  ("second", SecondPronounExtractor()),
                                  ("she", ShePronounExtractor()),
                                  ("he", HePronounExtractor()),
                                  ("it", ItPronounExtractor()),
                                  ("they", TheyPronounExtractor()),
                                  ("valence", ValenceExtractor()),
                                  ("arousal", ArousalExtractor()),
                                  ("dominance", DominanceExtractor()),
                                  ("neg", NegationExtractor()),
                                  ("total_lines", TotalLinesExtractor()),
                                  ("total_words", TotalWordsExtractor()),
                                  ("total_lemmas", TotalLemmasExtractor()),
                                  ("mean_words", MeanWordsPerLineExtractor()),
                                  ("mean_chars", MeanCharsPerWordExtractor()),
                                 ])

# Use combined features to transform dataset:
print(f'Extracting features...')
X_train = combined_features.fit_transform(train_texts)
X_test = combined_features.transform(eval_texts)
print(f'\tObtained the following dimensions of train, test data:',X_train.shape, X_test.shape)

lr_model = LogisticRegression(max_iter=2000,).fit(X_train, train_labels)
lr_predictions = lr_model.predict(X_test)

print(classification_report(eval_labels, lr_predictions))

Extracting features...
	Obtained the following dimensions of train, test data: (129, 946) (33, 946)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33


all_feats = [item[0] for item in combined_features.transformer_list]
best_lr_model = lr_model
best_acc = accuracy_score(eval_labels, lr_predictions)
for ablated_feat in all_feats:
    ablated_features = FeatureUnion([feat for feat in combined_features.transformer_list
                                     if feat[0] != ablated_feat])

    print(f'Extracting ablated features without {ablated_feat}...')
    X_train = ablated_features.fit_transform(train_texts)
    X_test = ablated_features.transform(eval_texts)
    print(f'\tObtained the following dimensions of train, test data:',X_train.shape, X_test.shape)

    model_ = LogisticRegression(max_iter=1000).fit(X_train, train_labels)
    predictions = model_.predict(X_test)

    print(classification_report(eval_labels, predictions))
    curr_acc = accuracy_score(eval_labels, predictions)
    if curr_acc > best_acc:
        best_lr_model = model_
        best_feat_set = ablated_features.transformer_list
        best_acc = curr_acc

Extracting ablated features without ngram...
	Obtained the following dimensions of train, test data: (129, 359) (33, 359)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.62      0.47      0.53        17
        paul       0.47      0.54      0.50        13

    accuracy                           0.45        33
   macro avg       0.36      0.34      0.34        33
weighted avg       0.50      0.45      0.47        33

Extracting ablated features without tfidf...
	Obtained the following dimensions of train, test data: (129, 604) (33, 604)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without ttr...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without first.sg...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without first.pl...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without second...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.47      0.69      0.56        13

    accuracy                           0.55        33
   macro avg       0.39      0.41      0.39        33
weighted avg       0.54      0.55      0.53        33

Extracting ablated features without she...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without he...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without it...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without they...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.75      0.53      0.62        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.40      0.38      0.38        33
weighted avg       0.56      0.52      0.52        33

Extracting ablated features without valence...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.58      0.41      0.48        17
        paul       0.38      0.62      0.47        13

    accuracy                           0.45        33
   macro avg       0.32      0.34      0.32        33
weighted avg       0.45      0.45      0.43        33

Extracting ablated features without arousal...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without dominance...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.67      0.47      0.55        17
        paul       0.42      0.62      0.50        13

    accuracy                           0.48        33
   macro avg       0.36      0.36      0.35        33
weighted avg       0.51      0.48      0.48        33

Extracting ablated features without neg...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without total_lines...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33

Extracting ablated features without total_words...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.73      0.47      0.57        17
        paul       0.47      0.69      0.56        13

    accuracy                           0.52        33
   macro avg       0.40      0.39      0.38        33
weighted avg       0.56      0.52      0.52        33

Extracting ablated features without total_lemmas...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.60      0.35      0.44        17
        paul       0.38      0.62      0.47        13

    accuracy                           0.42        33
   macro avg       0.33      0.32      0.31        33
weighted avg       0.46      0.42      0.41        33

Extracting ablated features without mean_words...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.75      0.53      0.62        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.40      0.38      0.38        33
weighted avg       0.56      0.52      0.52        33

Extracting ablated features without mean_chars...
	Obtained the following dimensions of train, test data: (129, 945) (33, 945)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.69      0.53      0.60        17
        paul       0.44      0.62      0.52        13

    accuracy                           0.52        33
   macro avg       0.38      0.38      0.37        33
weighted avg       0.53      0.52      0.51        33


best_acc, [feat_name for feat_name, feat in best_feat_set]

(0.5454545454545454,
 ['ngram',
  'tfidf',
  'ttr',
  'first.sg',
  'first.pl',
  'she',
  'he',
  'it',
  'they',
  'valence',
  'arousal',
  'dominance',
  'neg',
  'total_lines',
  'total_words',
  'total_lemmas',
  'mean_words',
  'mean_chars'])


ngram_feats = [item[0] for item in sorted(ngram_vectorizer.vocabulary_.items(),
                                          key=lambda x: x[1], reverse=False)]
tfidf_feats = [item[0] for item in sorted(tfidf_vectorizer.vocabulary_.items(),
                                          key=lambda x: x[1], reverse=False)]
feat_name2weight_per_beatle = {}
for label in [0, 1, 2]:
    feat_name2weight_per_beatle[{0:'george',1:'john',2:'paul'}[label]] = dict(zip(ngram_feats + tfidf_feats + \
                            [feat_name.upper() for feat_name, feat in best_feat_set][2:],
                             best_lr_model.coef_[label]))

top_feats_per_beatle = {}
for beatle in feat_name2weight_per_beatle:
    top_feats = sorted(feat_name2weight_per_beatle[beatle].items(), key=lambda x: x[1], reverse=True)[:20]
    top_feats_per_beatle[beatle] = [(item[0], round(item[1], 3)) for item in top_feats]
pd.DataFrame(top_feats_per_beatle)


import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

model_name = 'distilbert-base-cased'
device_name = 'cuda'
max_length = 512
cached_model_directory_name = 'distilbert-beatles-clf'

# Encode data
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
unique_labels = set(label for label in train_labels)
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
test_encodings  = tokenizer(eval_texts, truncation=True, padding=True, max_length=max_length)

train_labels_encoded = [label2id[y] for y in train_labels]
test_labels_encoded  = [label2id[y] for y in eval_labels]

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels_encoded)
test_dataset = MyDataset(test_encodings, test_labels_encoded)

bert_model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label))\
                                                .to(device_name)
training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=100,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='./results',          # output directory
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

trainer = Trainer(
    model=bert_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function 
)

trainer.train()

print(f"Saving model to: {cached_model_directory_name}")
trainer.save_model(cached_model_directory_name)

trainer.evaluate()
predicted_results = trainer.predict(test_dataset)

bert_predicted_labels = predicted_results.predictions.argmax(-1)      # Get the highest probability prediction
bert_predicted_labels = bert_predicted_labels.flatten().tolist()      # Flatten the predictions into a 1D list
bert_predicted_labels = [id2label[l] for l in bert_predicted_labels]  # Convert from integers back to strings for readability

print(classification_report(eval_labels, bert_predicted_labels))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 129
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Currently logged in as: yiweil (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.13.4 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to distilbert-beatles-clf
Configuration saved in distilbert-beatles-clf/config.json

Saving model to: distilbert-beatles-clf

Model weights saved in distilbert-beatles-clf/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 33
  Batch size = 20

***** Running Prediction *****
  Num examples = 33
  Batch size = 20

              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.00      0.00      0.00        17
        paul       0.39      1.00      0.57        13

    accuracy                           0.39        33
   macro avg       0.13      0.33      0.19        33
weighted avg       0.16      0.39      0.22        33


# majority class baseline
MAJ_CLASS = Counter(train_labels).most_common()[0][0]
maj_class_preds = [MAJ_CLASS] * len(eval_labels)
print(classification_report(eval_labels, maj_class_preds))

              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.52      1.00      0.68        17
        paul       0.00      0.00      0.00        13

    accuracy                           0.52        33
   macro avg       0.17      0.33      0.23        33
weighted avg       0.27      0.52      0.35        33


# random guess with informative prior
prior_probs = [(item[0],item[1]/len(train_labels)) for item in Counter(train_labels).most_common()]
print('Prior label probabilities:',prior_probs)
random_with_prior_preds = []
for i in range(len(eval_labels)):
    guess = np.random.choice([item[0] for item in prior_probs], size=1, replace=False,
                             p=[item[1] for item in prior_probs])[0]
    #print(guess)
    random_with_prior_preds.append(guess)
print(len(random_with_prior_preds))
print('Distribution of guesses with informed prior:',
      [(item[0],item[1]/len(eval_labels)) for item in Counter(random_with_prior_preds).most_common()])
print('\n')
print(classification_report(eval_labels, random_with_prior_preds))

Prior label probabilities: [('john', 0.43410852713178294), ('paul', 0.4263565891472868), ('george', 0.13953488372093023)]
33
Distribution of guesses with informed prior: [('john', 0.5454545454545454), ('paul', 0.36363636363636365), ('george', 0.09090909090909091)]


              precision    recall  f1-score   support

      george       0.33      0.33      0.33         3
        john       0.61      0.65      0.63        17
        paul       0.50      0.46      0.48        13

    accuracy                           0.55        33
   macro avg       0.48      0.48      0.48        33
weighted avg       0.54      0.55      0.54        33


# LR confusion matrix shows a lot of confusion b/w Paul and John, and George is not getting predicted at all

print(f'Extracting best performing combination of features: {[feat[0] for feat in best_feat_set]}...')
X_test = FeatureUnion(best_feat_set).transform(eval_texts)
best_lr_predictions = best_lr_model.predict(X_test)

classifications_dict = defaultdict(int)
for _true_label, _predicted_label in zip(eval_labels, best_lr_predictions):
    classifications_dict[(_true_label, _predicted_label)] += 1

dicts_to_plot = []
for (_true_author, _predicted_author), _count in classifications_dict.items():
    dicts_to_plot.append({'True Author': _true_author,
                        'Predicted Author': _predicted_author,
                        'Number of Classifications': _count})

df_to_plot = pd.DataFrame(dicts_to_plot)
df_wide = df_to_plot.pivot_table(index='True Author',
                                 columns='Predicted Author',
                                 values='Number of Classifications')

plt.figure(figsize=(9,7))
sns.set(style='ticks', font_scale=1.2)
sns.heatmap(df_wide, linewidths=1, cmap='Purples')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Extracting best performing combination of features: ['ngram', 'tfidf', 'ttr', 'first.sg', 'first.pl', 'she', 'he', 'it', 'they', 'valence', 'arousal', 'dominance', 'neg', 'total_lines', 'total_words', 'total_lemmas', 'mean_words', 'mean_chars']...


comp_df = pd.DataFrame({
    'song':[lyrics2song_title[x] for x in eval_texts],
    'true':eval_labels,
    'predicted':best_lr_predictions
})
comp_df.loc[comp_df['true']!=comp_df['predicted']].sort_values(['true','predicted'])


classifications_dict = defaultdict(int)
for _true_label, _predicted_label in zip(eval_labels, bert_predicted_labels):
    classifications_dict[(_true_label, _predicted_label)] += 1

dicts_to_plot = []
for (_true_author, _predicted_author), _count in classifications_dict.items():
    dicts_to_plot.append({'True Author': _true_author,
                        'Predicted Author': _predicted_author,
                        'Number of Classifications': _count})

df_to_plot = pd.DataFrame(dicts_to_plot)
df_wide = df_to_plot.pivot_table(index='True Author',
                                 columns='Predicted Author',
                                 values='Number of Classifications')

plt.figure(figsize=(9,7))
sns.set(style='ticks', font_scale=1.2)
sns.heatmap(df_wide, linewidths=1, cmap='Purples')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


LAST_NAMES = {
    'john': 'lennon',
    'paul': 'mccartney',
    'george': 'harrison',
    'ringo': 'starr'}

solo_dfs = []
for beatle in BEATLES:
    df_ = pd.read_csv(f'song_data/{LAST_NAMES[beatle]}_solo.csv')
    if df_.shape[1] == 6:
        df_.columns = ['Song', 'Writer(s)', 'Original release', 'Producer(s)', 'Year', 'Ref.']
    else:
        df_.columns = ['Song', 'Writer(s)', 'Original release', 'Year', 'Ref.']
    df_ = df_[['Song','Writer(s)','Original release','Year']]
    solo_dfs.append(df_)
solo_df = pd.concat(solo_dfs)
del solo_dfs, df_
print(solo_df.shape)
display(solo_df.head())

(1103, 4)


def regularize_writer(s):
    s = s.lower().strip()
    s = s.replace('richard','ringo')
    s = s.replace('starkey','starr')
    s = s.replace('[c]','')
    return s.split()[0]

def regularize_song_title(s):
    # only grab the title, between quotes
    split_s = s.split('"')
    s = split_s[1].lower().replace('-',' ')
    s = re.sub(r'''["]+''', "", s)
    s = re.sub("[\[].*?[\]]", "", s)
    s = re.sub(r"[^A-Za-z0-9 ]+", "", s)
    s = s.replace('cloud 9','cloud nine').replace(' link','').replace(' reprise','')
    return s

solo_df['Writer(s)'] = solo_df['Writer(s)'].apply(lambda x: regularize_writer(x))
solo_df = solo_df.loc[solo_df['Writer(s)'].isin(BEATLES)].copy()
solo_df['reg_song'] = solo_df['Song'].apply(lambda x: regularize_song_title(x))
solo_df = solo_df.loc[solo_df['Year'].apply(lambda x: str(x)[0] in {'1','2'})]
solo_df['Year'] = solo_df['Year'].apply(lambda x: int(x))
print(solo_df.shape)

(806, 5)


writer_counts = solo_df['Writer(s)'].value_counts()
solo_df['Writer(s)'].apply(
    lambda x: x.capitalize() + ', ' + str(writer_counts[x])).value_counts()\
.plot.pie(figsize=(5, 5), fontsize=18)
plt.ylabel("")
_ = plt.title("Figure 3. Number of songs written during each Beatle's solo career.", fontsize=20)


counts_df = solo_df.groupby(['Writer(s)','Year']).size() \
                   .sort_values(ascending=False) \
                   .reset_index(name='count')

fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(data=counts_df.sort_values('Year',ascending=True),
             style='Writer(s)',hue='Writer(s)',x='Year',y='count',ax=ax)
sns.despine()
ax.set_ylabel("Number of songs",fontsize=16)
ax.set_xlabel("")
plt.setp(ax.get_legend().get_texts(), fontsize='15')
plt.setp(ax.get_legend().get_title(), fontsize='20')
_ = ax.set_title("Figure 4. Number of songs written during each Beatle's solo career, over time",fontsize=20)


# Augment training data; retrain and re-evaluate and compare performance/results

song_title2author.update(dict(zip(solo_df['reg_song'].values,
                                  solo_df['Writer(s)'].apply(
                                      lambda x: x.lower() if type(x) == str else x).values)))
songs_without_lyrics = [song for song in solo_df['reg_song'].values if song not in song_title2lyrics]

aug_texts_with_labels = [(song_title2lyrics[song], song_title2author[song])
                          for song in solo_df['reg_song'].values
                          if song in song_title2lyrics
                          and song_title2author[song] in {'john','paul','george'}]
print(f"Assembled {len(aug_texts_with_labels)} (lyrics, label) datapoints to augment train data.\n")
print(f"\tDistribution of augmented train labels:", Counter([item[1] for item in aug_texts_with_labels]))

aug_train_texts_with_labels = train_texts_with_labels + aug_texts_with_labels
print(f"\nDistribution of new train labels:", Counter([item[1] for item in aug_train_texts_with_labels]))
print(f"Distribution of eval labels:", Counter([item[1] for item in eval_texts_with_labels]))

aug_train_texts = [item[0] for item in aug_train_texts_with_labels]
aug_train_labels = [item[1] for item in aug_train_texts_with_labels]

Assembled 446 (lyrics, label) datapoints to augment train data.

	Distribution of augmented train labels: Counter({'paul': 269, 'george': 107, 'john': 70})

Distribution of new train labels: Counter({'paul': 324, 'john': 126, 'george': 125})
Distribution of eval labels: Counter({'john': 17, 'paul': 13, 'george': 3})


# Logistic regression w/ different combinations of features

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion

ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,4), min_df=5)
tfidf_vectorizer = TfidfVectorizer(min_df=5)
combined_features = FeatureUnion([("ngram", ngram_vectorizer),
                                  ("tfidf", tfidf_vectorizer),
                                  ("ttr", TTRExtractor()),
                                  ("first.sg", FirstSgPronounExtractor()),
                                  ("first.pl", FirstPlPronounExtractor()),
                                  ("second", SecondPronounExtractor()),
                                  ("she", ShePronounExtractor()),
                                  ("he", HePronounExtractor()),
                                  ("it", ItPronounExtractor()),
                                  ("they", TheyPronounExtractor()),
                                  ("valence", ValenceExtractor()),
                                  ("arousal", ArousalExtractor()),
                                  ("dominance", DominanceExtractor()),
                                  ("neg", NegationExtractor()),
                                  ("total_lines", TotalLinesExtractor()),
                                  ("total_words", TotalWordsExtractor()),
                                  ("total_lemmas", TotalLemmasExtractor()),
                                  ("mean_words", MeanWordsPerLineExtractor()),
                                  ("mean_chars", MeanCharsPerWordExtractor()),
                                 ])

# Use combined features to transform dataset:
print(f'Extracting features...')
X_train = combined_features.fit_transform(aug_train_texts)
X_test = combined_features.transform(eval_texts)
print(f'\tObtained the following dimensions of train, test data:',X_train.shape, X_test.shape)

aug_lr_model = LogisticRegression(max_iter=1000).fit(X_train, aug_train_labels)
aug_lr_predictions = aug_lr_model.predict(X_test)

print(classification_report(eval_labels, aug_lr_predictions))

Extracting features...
	Obtained the following dimensions of train, test data: (575, 3982) (33, 3982)
              precision    recall  f1-score   support

      george       0.25      0.33      0.29         3
        john       0.56      0.29      0.38        17
        paul       0.40      0.62      0.48        13

    accuracy                           0.42        33
   macro avg       0.40      0.41      0.39        33
weighted avg       0.47      0.42      0.42        33


# LR confusion matrix shows a lot of confusion b/w Paul and John, and George is not getting predicted at all

classifications_dict = defaultdict(int)
for _true_label, _predicted_label in zip(eval_labels, aug_lr_predictions):
    classifications_dict[(_true_label, _predicted_label)] += 1

dicts_to_plot = []
for (_true_author, _predicted_author), _count in classifications_dict.items():
    dicts_to_plot.append({'True Author': _true_author,
                        'Predicted Author': _predicted_author,
                        'Number of Classifications': _count})

df_to_plot = pd.DataFrame(dicts_to_plot)
df_wide = df_to_plot.pivot_table(index='True Author',
                                 columns='Predicted Author',
                                 values='Number of Classifications')

plt.figure(figsize=(9,7))
sns.set(style='ticks', font_scale=1.2)
sns.heatmap(df_wide, linewidths=1, cmap='Purples')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


downsample_target = 125
RANDOM_SEED = 1969
np.random.seed(RANDOM_SEED)
paul_data = [item for item in aug_train_texts_with_labels if item[1]=='paul']
nonpaul_data = [item for item in aug_train_texts_with_labels if item[1]!='paul']
downsample_paul_ixs = set(np.random.choice(a=list(range(len(combined_texts_with_labels))),
                                           size=downsample_target, replace=False))
downsample_paul_data = [item for ix,item in enumerate(paul_data) if ix in downsample_paul_ixs]
print(f"Randomly chose {len(downsample_paul_data)} out of {len(paul_data)} \
datapoints total for downsampled train data.")

down_train_texts = [item[0] for item in downsample_paul_data + nonpaul_data]
down_train_labels = [item[1] for item in downsample_paul_data + nonpaul_data]
print(f"\nDistribution of downsampled train labels:", Counter(down_train_labels))
print(f"Distribution of eval labels:", Counter(eval_labels))

Randomly chose 125 out of 324 datapoints total for downsampled train data.

Distribution of downsampled train labels: Counter({'john': 126, 'paul': 125, 'george': 125})
Distribution of eval labels: Counter({'john': 17, 'paul': 13, 'george': 3})


len(train_labels), len(down_train_labels)

(129, 376)


# Logistic regression w/ different combinations of features

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion

ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,4), min_df=5)
tfidf_vectorizer = TfidfVectorizer(min_df=5)
combined_features = FeatureUnion([("ngram", ngram_vectorizer),
                                  ("tfidf", tfidf_vectorizer),
                                  ("ttr", TTRExtractor()),
                                  ("first.sg", FirstSgPronounExtractor()),
                                  ("first.pl", FirstPlPronounExtractor()),
                                  ("second", SecondPronounExtractor()),
                                  ("she", ShePronounExtractor()),
                                  ("he", HePronounExtractor()),
                                  ("it", ItPronounExtractor()),
                                  ("they", TheyPronounExtractor()),
                                  ("valence", ValenceExtractor()),
                                  ("arousal", ArousalExtractor()),
                                  ("dominance", DominanceExtractor()),
                                  ("neg", NegationExtractor()),
                                  ("total_lines", TotalLinesExtractor()),
                                  ("total_words", TotalWordsExtractor()),
                                  ("total_lemmas", TotalLemmasExtractor()),
                                  ("mean_words", MeanWordsPerLineExtractor()),
                                  ("mean_chars", MeanCharsPerWordExtractor()),
                                 ])

# Use combined features to transform dataset:
print(f'Extracting features...')
X_train = combined_features.fit_transform(down_train_texts)
X_test = combined_features.transform(eval_texts)
print(f'\tObtained the following dimensions of train, test data:',X_train.shape, X_test.shape)

down_lr_model = LogisticRegression(max_iter=1000).fit(X_train, down_train_labels)
down_lr_predictions = down_lr_model.predict(X_test)

print(classification_report(eval_labels, down_lr_predictions))

Extracting features...
	Obtained the following dimensions of train, test data: (376, 2635) (33, 2635)
              precision    recall  f1-score   support

      george       0.33      0.67      0.44         3
        john       0.58      0.41      0.48        17
        paul       0.40      0.46      0.43        13

    accuracy                           0.45        33
   macro avg       0.44      0.51      0.45        33
weighted avg       0.49      0.45      0.46        33


all_feats = [item[0] for item in combined_features.transformer_list]
best_lr_model = down_lr_model
best_acc = accuracy_score(eval_labels, down_lr_predictions)
for ablated_feat in all_feats:
    ablated_features = FeatureUnion([feat for feat in combined_features.transformer_list
                                     if feat[0] != ablated_feat])

    print(f'Extracting ablated features without {ablated_feat}...')
    X_train = ablated_features.fit_transform(down_train_texts)
    X_test = ablated_features.transform(eval_texts)
    print(f'\tObtained the following dimensions of train, test data:',X_train.shape, X_test.shape)

    model_ = LogisticRegression(max_iter=1000).fit(X_train, down_train_labels)
    predictions = model_.predict(X_test)

    print(classification_report(eval_labels, predictions))
    curr_acc = accuracy_score(eval_labels, predictions)
    if curr_acc > best_acc:
        best_lr_model = model_
        best_feat_set = ablated_features.transformer_list
        best_acc = curr_acc

Extracting ablated features without ngram...
	Obtained the following dimensions of train, test data: (376, 786) (33, 786)
              precision    recall  f1-score   support

      george       0.11      0.33      0.17         3
        john       0.64      0.41      0.50        17
        paul       0.46      0.46      0.46        13

    accuracy                           0.42        33
   macro avg       0.40      0.40      0.38        33
weighted avg       0.52      0.42      0.45        33

Extracting ablated features without tfidf...
	Obtained the following dimensions of train, test data: (376, 1866) (33, 1866)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.50      0.35      0.41        17
        paul       0.36      0.38      0.37        13

    accuracy                           0.39        33
   macro avg       0.38      0.47      0.39        33
weighted avg       0.42      0.39      0.40        33

Extracting ablated features without ttr...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.55      0.35      0.43        17
        paul       0.40      0.46      0.43        13

    accuracy                           0.42        33
   macro avg       0.41      0.49      0.42        33
weighted avg       0.46      0.42      0.43        33

Extracting ablated features without first.sg...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.25      0.67      0.36         3
        john       0.55      0.35      0.43        17
        paul       0.43      0.46      0.44        13

    accuracy                           0.42        33
   macro avg       0.41      0.49      0.41        33
weighted avg       0.47      0.42      0.43        33

Extracting ablated features without first.pl...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.54      0.41      0.47        17
        paul       0.38      0.38      0.38        13

    accuracy                           0.42        33
   macro avg       0.40      0.49      0.42        33
weighted avg       0.45      0.42      0.43        33

Extracting ablated features without second...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.17      0.33      0.22         3
        john       0.57      0.47      0.52        17
        paul       0.46      0.46      0.46        13

    accuracy                           0.45        33
   macro avg       0.40      0.42      0.40        33
weighted avg       0.49      0.45      0.47        33

Extracting ablated features without she...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.33      0.67      0.44         3
        john       0.58      0.41      0.48        17
        paul       0.40      0.46      0.43        13

    accuracy                           0.45        33
   macro avg       0.44      0.51      0.45        33
weighted avg       0.49      0.45      0.46        33

Extracting ablated features without he...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.55      0.35      0.43        17
        paul       0.40      0.46      0.43        13

    accuracy                           0.42        33
   macro avg       0.41      0.49      0.42        33
weighted avg       0.46      0.42      0.43        33

Extracting ablated features without it...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.14      0.33      0.20         3
        john       0.58      0.41      0.48        17
        paul       0.43      0.46      0.44        13

    accuracy                           0.42        33
   macro avg       0.38      0.40      0.38        33
weighted avg       0.48      0.42      0.44        33

Extracting ablated features without they...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.58      0.41      0.48        17
        paul       0.43      0.46      0.44        13

    accuracy                           0.45        33
   macro avg       0.43      0.51      0.44        33
weighted avg       0.50      0.45      0.46        33

Extracting ablated features without valence...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.33      0.67      0.44         3
        john       0.60      0.35      0.44        17
        paul       0.41      0.54      0.47        13

    accuracy                           0.45        33
   macro avg       0.45      0.52      0.45        33
weighted avg       0.50      0.45      0.45        33

Extracting ablated features without arousal...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.22      0.67      0.33         3
        john       0.60      0.35      0.44        17
        paul       0.43      0.46      0.44        13

    accuracy                           0.42        33
   macro avg       0.42      0.49      0.41        33
weighted avg       0.50      0.42      0.43        33

Extracting ablated features without dominance...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.33      0.67      0.44         3
        john       0.54      0.41      0.47        17
        paul       0.36      0.38      0.37        13

    accuracy                           0.42        33
   macro avg       0.41      0.49      0.43        33
weighted avg       0.45      0.42      0.43        33

Extracting ablated features without neg...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.20      0.33      0.25         3
        john       0.60      0.53      0.56        17
        paul       0.46      0.46      0.46        13

    accuracy                           0.48        33
   macro avg       0.42      0.44      0.42        33
weighted avg       0.51      0.48      0.49        33

Extracting ablated features without total_lines...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.57      0.47      0.52        17
        paul       0.42      0.38      0.40        13

    accuracy                           0.45        33
   macro avg       0.42      0.51      0.44        33
weighted avg       0.48      0.45      0.46        33

Extracting ablated features without total_words...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.17      0.33      0.22         3
        john       0.53      0.47      0.50        17
        paul       0.42      0.38      0.40        13

    accuracy                           0.42        33
   macro avg       0.37      0.40      0.37        33
weighted avg       0.45      0.42      0.44        33

Extracting ablated features without total_lemmas...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.20      0.67      0.31         3
        john       0.50      0.29      0.37        17
        paul       0.31      0.31      0.31        13

    accuracy                           0.33        33
   macro avg       0.34      0.42      0.33        33
weighted avg       0.40      0.33      0.34        33

Extracting ablated features without mean_words...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.29      0.67      0.40         3
        john       0.50      0.35      0.41        17
        paul       0.36      0.38      0.37        13

    accuracy                           0.39        33
   macro avg       0.38      0.47      0.39        33
weighted avg       0.42      0.39      0.40        33

Extracting ablated features without mean_chars...
	Obtained the following dimensions of train, test data: (376, 2634) (33, 2634)
              precision    recall  f1-score   support

      george       0.14      0.33      0.20         3
        john       0.50      0.35      0.41        17
        paul       0.43      0.46      0.44        13

    accuracy                           0.39        33
   macro avg       0.36      0.38      0.35        33
weighted avg       0.44      0.39      0.41        33


best_acc, [feat_name for feat_name, feat in best_feat_set]

(0.48484848484848486,
 ['ngram',
  'tfidf',
  'ttr',
  'first.sg',
  'first.pl',
  'second',
  'she',
  'he',
  'it',
  'they',
  'valence',
  'arousal',
  'dominance',
  'total_lines',
  'total_words',
  'total_lemmas',
  'mean_words',
  'mean_chars'])


# LR confusion matrix 
print(f'Extracting best performing combination of features: {[feat[0] for feat in best_feat_set]}...')
X_test = FeatureUnion(best_feat_set).transform(eval_texts)
down_lr_predictions = best_lr_model.predict(X_test)

classifications_dict = defaultdict(int)
for _true_label, _predicted_label in zip(eval_labels, down_lr_predictions):
    classifications_dict[(_true_label, _predicted_label)] += 1

dicts_to_plot = []
for (_true_author, _predicted_author), _count in classifications_dict.items():
    dicts_to_plot.append({'True Author': _true_author,
                        'Predicted Author': _predicted_author,
                        'Number of Classifications': _count})

df_to_plot = pd.DataFrame(dicts_to_plot)
df_wide = df_to_plot.pivot_table(index='True Author',
                                 columns='Predicted Author',
                                 values='Number of Classifications')

plt.figure(figsize=(9,7))
sns.set(style='ticks', font_scale=1.2)
sns.heatmap(df_wide, linewidths=1, cmap='Purples')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Extracting best performing combination of features: ['ngram', 'tfidf', 'ttr', 'first.sg', 'first.pl', 'second', 'she', 'he', 'it', 'they', 'valence', 'arousal', 'dominance', 'total_lines', 'total_words', 'total_lemmas', 'mean_words', 'mean_chars']...


ngram_feats = [item[0] for item in sorted(ngram_vectorizer.vocabulary_.items(),
                                          key=lambda x: x[1], reverse=False)]
tfidf_feats = [item[0] for item in sorted(tfidf_vectorizer.vocabulary_.items(),
                                          key=lambda x: x[1], reverse=False)]
feat_name2weight_per_beatle = {}
for label in [0, 1, 2]:
    feat_name2weight_per_beatle[{0:'george',1:'john',2:'paul'}[label]] = dict(zip(ngram_feats + tfidf_feats + \
                            [feat_name.upper() for feat_name, feat in best_feat_set][2:],
                             best_lr_model.coef_[label]))

top_feats_per_beatle = {}
for beatle in feat_name2weight_per_beatle:
    top_feats = sorted(feat_name2weight_per_beatle[beatle].items(), key=lambda x: x[1], reverse=True)[:20]
    top_feats_per_beatle[beatle] = [(item[0], round(item[1], 3)) for item in top_feats]
pd.DataFrame(top_feats_per_beatle)


import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

model_name = 'distilbert-base-cased'
device_name = 'cuda'
max_length = 512
cached_model_directory_name = 'distilbert-beatles-clf-solo-aug-downsample'

# Encode data
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
unique_labels = set(label for label in train_labels)
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
test_encodings  = tokenizer(eval_texts, truncation=True, padding=True, max_length=max_length)

train_labels_encoded = [label2id[y] for y in train_labels]
test_labels_encoded  = [label2id[y] for y in eval_labels]

print(set(train_labels_encoded))
print(set(test_labels_encoded))

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels_encoded)
test_dataset = MyDataset(test_encodings, test_labels_encoded)

down_bert_model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label)).to(device_name)
training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=100,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='./results',          # output directory
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

trainer = Trainer(
    model=down_bert_model,               # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here we just send our test set)
    compute_metrics=compute_metrics      # our custom evaluation function 
)

trainer.train()

print(f"Saving model to: {cached_model_directory_name}")
trainer.save_model(cached_model_directory_name)

trainer.evaluate()

predicted_results = trainer.predict(test_dataset)
print(predicted_results.predictions.shape)

down_bert_predicted_labels = predicted_results.predictions.argmax(-1)           # Get the highest probability prediction
down_bert_predicted_labels = down_bert_predicted_labels.flatten().tolist()      # Flatten the predictions into a 1D list
down_bert_predicted_labels = [id2label[l] for l in down_bert_predicted_labels]  # Convert from integers back to strings for readability

print(classification_report(eval_labels, down_bert_predicted_labels))

loading file https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt from cache at /sailhome/yiweil/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json from cache at /sailhome/yiweil/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/distilbert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer_config.json from cache at /sailhome/yiweil/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /sailhome/yiweil/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 28996
}

{0, 1, 2}
{0, 1, 2}

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /sailhome/yiweil/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 28996
}

loading weights file https://huggingface.co/distilbert-base-cased/resolve/main/pytorch_model.bin from cache at /sailhome/yiweil/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434eb9f1bc85a23a0.06b428c87335c1bb22eae46fdab31c8286efa0aa09e898a7ac42ddf5c3f5dc19
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 129
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to distilbert-beatles-clf-solo-aug-downsample
Configuration saved in distilbert-beatles-clf-solo-aug-downsample/config.json

Saving model to: distilbert-beatles-clf-solo-aug-downsample

Model weights saved in distilbert-beatles-clf-solo-aug-downsample/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 33
  Batch size = 20

***** Running Prediction *****
  Num examples = 33
  Batch size = 20

(33, 3)
              precision    recall  f1-score   support

      george       0.00      0.00      0.00         3
        john       0.67      0.12      0.20        17
        paul       0.40      0.92      0.56        13

    accuracy                           0.42        33
   macro avg       0.36      0.35      0.25        33
weighted avg       0.50      0.42      0.32        33


classifications_dict = defaultdict(int)
for _true_label, _predicted_label in zip(eval_labels, down_bert_predicted_labels):
    classifications_dict[(_true_label, _predicted_label)] += 1

dicts_to_plot = []
for (_true_author, _predicted_author), _count in classifications_dict.items():
    dicts_to_plot.append({'True Author': _true_author,
                        'Predicted Author': _predicted_author,
                        'Number of Classifications': _count})

df_to_plot = pd.DataFrame(dicts_to_plot)
df_wide = df_to_plot.pivot_table(index='True Author',
                                 columns='Predicted Author',
                                 values='Number of Classifications')

plt.figure(figsize=(9,7))
sns.set(style='ticks', font_scale=1.2)
sns.heatmap(df_wide, linewidths=1, cmap='Purples')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


vocab_size_df = pd.DataFrame({'song': song_title2dedup_lyrics.keys(),
                              'vocab_size': [len(set([tok.lemma_ for tok in song_title2doc[song]]))
                                             for song in song_title2dedup_lyrics.keys()],
                              'num_tokens': [len([tok for tok in song_title2doc[song]])
                                             for song in song_title2dedup_lyrics.keys()]})
vocab_size_df.describe()


fig, (ax1, ax2) = plt.subplots(1,2,figsize=(16,4),sharey=True)
sns.histplot(x='vocab_size',data=vocab_size_df,ax=ax1)
sns.histplot(x='num_tokens',data=vocab_size_df,ax=ax2)

<matplotlib.axes._subplots.AxesSubplot at 0x7f2a192d4c18>

	song	songwriter	credits	year written	year recorded	album	notes	reg_song
0	I Saw Her Standing There	Paul	McCartney-Lennon	1962	1963	NaN	collaborative	i saw her standing there
1	Misery	John, Paul	McCartney-Lennon	1963	1963	NaN	NaN	misery
2	Ask Me Why	John	McCartney-Lennon	1962	1962	NaN	NaN	ask me why
3	Please Please Me	John	McCartney-Lennon	1962	1962	NaN	NaN	please please me
4	Love Me Do	Paul	McCartney-Lennon	1962	1962	NaN	"Although George Martin did not like the words much, Paul claimed them as his creation."	love me do

	george	john	paul
0	(TOTAL_LEMMAS, 0.198)	(MEAN_WORDS, 0.206)	(VALENCE, 0.234)
1	(THEY, 0.192)	(let me, 0.19)	(what you, 0.177)
2	(NEG, 0.149)	(ll be, 0.161)	(HE, 0.164)
3	(don know, 0.137)	(me so, 0.156)	(SHE, 0.151)
4	(oh oh, 0.089)	(DOMINANCE, 0.15)	(you you, 0.144)
5	(want to, 0.08)	(you know, 0.142)	(on the, 0.143)
6	(you re, 0.067)	(the world, 0.136)	(FIRST.PL, 0.133)
7	(the sun, 0.067)	(the sky, 0.116)	(and in, 0.122)
8	(too much, 0.057)	(AROUSAL, 0.116)	(you won, 0.12)
9	(IT, 0.054)	(that you, 0.108)	(to the, 0.111)
10	(it all, 0.053)	(in your, 0.092)	(love you, 0.099)
11	(don want, 0.052)	(it so, 0.089)	(to get, 0.08)
12	(if you, 0.05)	(do you, 0.089)	(to make, 0.072)
13	(with you, 0.049)	(you ve, 0.089)	(oh oh, 0.071)
14	(ve got, 0.049)	(well you, 0.078)	(to say, 0.067)
15	(yeah yeah yeah, 0.048)	(me down, 0.077)	(back to, 0.066)
16	(be there, 0.047)	(in love with you, 0.077)	(you say, 0.064)
17	(tell you, 0.046)	(love with you, 0.077)	(me the, 0.063)
18	(be the, 0.046)	(yes you, 0.073)	(in my, 0.062)
19	(don want to, 0.044)	(can you, 0.072)	(make it, 0.061)

	Song	Writer(s)	Original release	Year
0	"1882"	Paul McCartneyand Wings	Paul McCartneyLinda McCartney	2018
1	"222"	Paul McCartney	Paul McCartney	2007
2	"3 Legs"	Paul and Linda McCartney	Paul McCartney	1971
3	"4 4 4"	The Fireman	Paul McCartneyYouth	1993
4	"4th of July"	Wings	Paul McCartneyLinda McCartney	2014

	george	john	paul
0	(love you, 0.185)	(the sky, 0.219)	(the one, 0.285)
1	(THEY, 0.16)	(came to, 0.179)	(one you, 0.193)
2	(want to, 0.15)	(do you, 0.166)	(what you, 0.17)
3	(IT, 0.149)	(in your, 0.157)	(VALENCE, 0.158)
4	(your love, 0.142)	(in the sky, 0.156)	(SHE, 0.153)
5	(FIRST.SG, 0.142)	(my little, 0.146)	(you and, 0.152)
6	(with all, 0.141)	(me so, 0.143)	(the night, 0.142)
7	(out the, 0.131)	(your name, 0.142)	(me love, 0.142)
8	(without you, 0.127)	(at night, 0.125)	(with me, 0.128)
9	(you know, 0.122)	(time is, 0.123)	(it now, 0.127)
10	(you all, 0.118)	(our love, 0.115)	(no more, 0.119)
11	(you may, 0.117)	(MEAN_WORDS, 0.115)	(when you, 0.118)
12	(TOTAL_LEMMAS, 0.116)	(to me, 0.112)	(out of, 0.116)
13	(is to, 0.111)	(DOMINANCE, 0.109)	(in my, 0.113)
14	(don know, 0.111)	(and know, 0.108)	(you don, 0.113)
15	(it is, 0.109)	(the world, 0.105)	(for the, 0.111)
16	(got to, 0.107)	(it so, 0.104)	(it you, 0.106)
17	(do what, 0.106)	(so hard, 0.103)	(do it, 0.101)
18	(as it, 0.105)	(that you, 0.098)	(on your, 0.096)
19	(your feet, 0.1)	(let me, 0.098)	(the time, 0.092)

	vocab_size	num_tokens
count	623.000000	623.000000
mean	67.385233	152.611557
std	25.664028	63.356263
min	1.000000	1.000000
25%	51.000000	114.000000
50%	66.000000	147.000000
75%	80.000000	185.000000
max	249.000000	651.000000

Predicting the authorship of Beatles lyrics¶

Assembling the dataset¶

Processing lyrics¶

Training classifiers¶

How well do the classifiers do on the Beatles' solo-career lyrics?¶

Conclusion + other ideas¶

Footnotes¶

Ablated feature	Change in accuracy	Absolute accuracy (with remaining features)
ngram	-10%	0.42
tf-idf	0	0.52
ttr	0	0.52
first.sg	0	0.52
first.pl	0	0.52
second	+3%	0.55
third.sg.f	0	0.52
third.sg.m	0	0.52
third.sg.n	0	0.52
third.pl	0	0.52
valence	-7%	0.45
arousal	0	0.52
dominance	-4%	0.48
negation	0	0.52
total.num.lines	0	0.52
total.num.words	0	0.52
total.num.lemmas	-10%	0.42
mean.words.per.line	0	0.52
mean.chars.per.word	0	0.52

	song	true	predicted
1	its all too much	george	paul
10	i me mine	george	paul
30	long long long	george	paul
5	sun king	john	george
2	come together	john	paul
4	not a second time	john	paul
14	help	john	paul
18	norwegian wood this bird has flown	john	paul
20	run for your life	john	paul
31	cry baby cry	john	paul
32	good night	john	paul
9	carry that weight	paul	john
17	yesterday	paul	john
21	we can work it out	paul	john
22	corridor music	paul	john