This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for summarization tasks using architectures like BART and T5.
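The code below assumes the usual blurr and fastai imports are already in place. As a rough sketch (the exact module paths are an assumption and may differ between blurr versions), the snippets rely on something like:
import torch
import pandas as pd
from pathlib import Path
from fastai.text.all import *   # DataBlock, ColReader, RandomSplitter, Pipeline, noop
from fastcore.test import *     # test_eq, used in the tests further down
from blurr.data.all import *    # HF_Seq2SeqBlock, HF_Seq2SeqBeforeBatchTransform, etc. (path assumed)
from blurr.utils import *       # BLURR_MODEL_HELPER, HF_TASKS_AUTO (path assumed)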
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')
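Note that the device index above is specific to this machine; a hedged variant that only pins a GPU when more than one is actually available might look like:
# only pin a specific GPU if the machine actually has more than one (index 1 is just an example)
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)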
path = Path('./')
cnndm_df = pd.read_csv(path/'cnndm_sample.csv'); len(cnndm_df)
cnndm_df.head(2)
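It can also be useful to get a feel for how long the raw articles and summaries are, since the batch transform will truncate anything past the maximum lengths we configure. A quick, purely illustrative pandas check:
# character lengths of the raw articles and reference summaries
cnndm_df['article'].str.len().describe(), cnndm_df['highlights'].str.len().describe()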
pretrained_model_name = "facebook/bart-large-cnn"
task = HF_TASKS_AUTO.Seq2SeqLM
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)
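Before wiring these objects into a DataBlock, it's worth sanity-checking a couple of standard Hugging Face attributes on the returned tokenizer and config:
# standard Hugging Face attributes; the values you see depend on the checkpoint
print(hf_tokenizer.model_max_length)  # longest input sequence the tokenizer will handle
print(hf_config.max_length)           # default max generation length stored in the model config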
blocks = (HF_Seq2SeqBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks,
                   get_x=ColReader('article'),
                   get_y=ColReader('highlights'),
                   splitter=RandomSplitter())
Two lines! Notice we pass in noop for our targets (i.e., our summaries) because the batch transform will take care of both our inputs and targets.
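If something doesn't look right, fastai's DataBlock.summary can step through each pipeline on the sample DataFrame (with the Hugging Face batch transform in play, expect fairly verbose output):
dblock.summary(cnndm_df)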
dls = dblock.dataloaders(cnndm_df, bs=4)
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[0]['labels'].shape, b[1].shape
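You can also decode one of the tokenized inputs back into text to eyeball the truncation and special tokens (a standard tokenizer call, shown here just for inspection):
# decode the first input sequence in the batch; truncate the printout for readability
print(hf_tokenizer.decode(b[0]['input_ids'][0], skip_special_tokens=True)[:500])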
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000, target_trunc_at=250)
Tests
The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained summarization models below. These tests are excluded from the CI workflow because of how long they take to run and how much data they require downloading.
Note: Feel free to modify the code below to test whatever pretrained summarization models you are working with ... and if any of them fail, please submit a GitHub issue (or a PR if you'd like to fix it yourself).
[model_type for model_type in BLURR_MODEL_HELPER.get_models(task='ConditionalGeneration')
 if (not model_type.__name__.startswith('TF'))]
pretrained_model_names = [
    'facebook/bart-base',
    'facebook/blenderbot_small-90M',
    'allenai/led-base-16384',
    'google/mt5-small',
    'google/pegasus-cnn_dailymail',
    't5-small',
    'microsoft/prophetnet-large-uncased',
    'microsoft/xprophetnet-large-wiki100-cased',  # XLMProphetNet
]
path = Path('./')
cnndm_df = pd.read_csv(path/'cnndm_sample.csv')
#hide_output
task = HF_TASKS_AUTO.Seq2SeqLM
bsz = 2
seq_sz = 256
trg_seq_sz = 40
test_results = []
for model_name in pretrained_model_names:
    error = None
    print(f'=== {model_name} ===\n')

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(model_name, task=task)
    print(f'architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n')

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if (hf_tokenizer.pad_token is None):
        hf_tokenizer.add_special_tokens({'pad_token': '<pad>'})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()['<pad>']
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    before_batch_tfm = HF_Seq2SeqBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model,
                                                      padding='max_length',
                                                      max_length=seq_sz,
                                                      max_target_length=trg_seq_sz)

    # T5 expects a task prefix on its inputs; other architectures pass the article through unchanged
    def add_t5_prefix(inp): return f'summarize: {inp}' if (hf_arch == 't5') else inp

    blocks = (HF_Seq2SeqBlock(before_batch_tfm=before_batch_tfm), noop)
    dblock = DataBlock(blocks=blocks,
                       get_x=Pipeline([ColReader('article'), add_t5_prefix]),
                       get_y=ColReader('highlights'),
                       splitter=RandomSplitter())

    dls = dblock.dataloaders(cnndm_df, bs=bsz)
    b = dls.one_batch()

    try:
        print('*** TESTING DataLoaders ***\n')
        test_eq(len(b), 2)
        test_eq(len(b[0]['input_ids']), bsz)
        test_eq(b[0]['input_ids'].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        if (hasattr(hf_tokenizer, 'add_prefix_space') and hf_arch not in ['led']):
            test_eq(hf_tokenizer.add_prefix_space, True)

        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'PASSED', ''))
        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)
    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'FAILED', err))
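Once the loop finishes, a small summary of what passed and failed can be printed from test_results; this is just a sketch (the column names are illustrative, not part of the library):
# tabulate the (arch, tokenizer, model_name, result, error) tuples collected above
test_results_df = pd.DataFrame(test_results, columns=['arch', 'tokenizer', 'model_name', 'result', 'error'])
print(test_results_df)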