| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655 |
- ## @package translate
- # Module caffe2.python.models.seq2seq.translate
- from abc import ABCMeta, abstractmethod
- import argparse
- from future.utils import viewitems
- import logging
- import numpy as np
- import sys
- from caffe2.python import core, rnn_cell, workspace
- from caffe2.python.models.seq2seq.beam_search import BeamSearchForwardOnly
- from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper
- import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util
# Module-level logger: records at INFO and above are written to stderr.
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler(sys.stderr))
logger.setLevel(logging.INFO)
def _weighted_sum(model, values, weight, output_name):
    """Add a WeightedSum op that applies the same weight to every value.

    The op's input list interleaves each value with ``weight``:
    ``[values[0], weight, values[1], weight, ...]``.

    Args:
        model: object whose ``net`` exposes ``WeightedSum``.
        values: blobs to combine.
        weight: the single weight blob paired with each value.
        output_name: name passed to the op as its output.

    Returns:
        Whatever ``model.net.WeightedSum`` returns.
    """
    interleaved_inputs = []
    for value in values:
        interleaved_inputs.append(value)
        interleaved_inputs.append(weight)
    return model.net.WeightedSum(interleaved_inputs, output_name)
class Seq2SeqModelCaffe2EnsembleDecoderBase(metaclass=ABCMeta):
    """Abstract base for ensemble seq2seq decoders.

    Subclasses say where each ensemble member's checkpoint lives
    (``get_model_file``) and which DB format it uses (``get_db_type``).
    This class supplies the word-reward vector and the checkpoint loader.

    ``load_models`` reads ``self.models``, ``self.decoder_scope_names`` and
    ``self.model``. Nothing in this class assigns them, so the subclass
    must set them before calling it.
    """

    @abstractmethod
    def get_model_file(self, model):
        """Return the checkpoint location for one entry of ``self.models``."""

    @abstractmethod
    def get_db_type(self):
        """Return the ``db_type`` string given to the ``CreateDB`` operator."""

    def build_word_rewards(self, vocab_size, word_reward, unk_reward):
        """Build the per-token reward vector.

        Args:
            vocab_size: length of the returned vector.
            word_reward: reward assigned to every ordinary token.
            unk_reward: extra reward added on top of ``word_reward`` for
                the UNK token.

        Returns:
            A float32 numpy array of shape ``[vocab_size]``. The PAD, GO
            and EOS entries are 0 and the UNK entry is
            ``word_reward + unk_reward``.
        """
        word_rewards = np.full([vocab_size], word_reward, dtype=np.float32)
        word_rewards[seq2seq_util.PAD_ID] = 0
        word_rewards[seq2seq_util.GO_ID] = 0
        word_rewards[seq2seq_util.EOS_ID] = 0
        word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward
        return word_rewards

    def load_models(self):
        """Load each ensemble member's parameters from its checkpoint.

        For every (model, scope name) pair this runs a ``CreateDB``
        operator on the model's checkpoint and then a ``Load`` operator
        that strips the ``gpu_0/`` prefix and adds ``<scope_name>/``.

        Raises:
            AssertionError: if either operator reports failure. The type
                is kept from the earlier ``assert``-based implementation
                so existing callers see the same exception.
        """
        db_reader = 'reader'
        for model, scope_name in zip(self.models, self.decoder_scope_names):
            # Match on "<scope>/" rather than "<scope>" so that "model1"
            # does not also select the parameters of "model10", "model11", ...
            scope_prefix = scope_name + '/'
            params_for_current_model = [
                param
                for param in self.model.GetAllParams()
                if str(param).startswith(scope_prefix)
            ]
            model_file = self.get_model_file(model)

            # The operators are run outside of `assert` statements:
            # `python -O` strips asserts, which would otherwise skip the
            # load entirely.
            db_created = workspace.RunOperatorOnce(core.CreateOperator(
                'CreateDB',
                [], [db_reader],
                db=model_file,
                db_type=self.get_db_type(),
            ))
            if not db_created:
                raise AssertionError(
                    'Failed to create db {}'.format(model_file)
                )

            params_loaded = workspace.RunOperatorOnce(core.CreateOperator(
                'Load',
                [db_reader],
                params_for_current_model,
                load_all=1,
                add_prefix=scope_prefix,
                strip_prefix='gpu_0/',
            ))
            if not params_loaded:
                raise AssertionError(
                    'Failed to load parameters for {} from {}'.format(
                        scope_name, model_file)
                )

            logger.info('Model {} is loaded from a checkpoint {}'.format(
                scope_name, model_file))
class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase):
    """Beam-search decoder over an ensemble of seq2seq models.

    The constructor builds one decoder per ensemble member (each under its
    own ``model<i>`` scope), averages their per-step log-probabilities and
    attention weights, and wires the result into a forward-only beam
    search. Checkpoints are read as ``minidb`` databases from each model's
    ``'model_file'`` entry.
    """

    def get_model_file(self, model):
        """Return the checkpoint path stored under ``model['model_file']``."""
        return model['model_file']

    def get_db_type(self):
        """Return the DB type used for checkpoints (``'minidb'``)."""
        return 'minidb'

    def scope(self, scope_name, blob_name):
        """Return ``scope_name/blob_name``, or ``blob_name`` if no scope is given."""
        if scope_name is None:
            return blob_name
        return scope_name + '/' + blob_name

    def _build_decoder(
        self,
        model,
        step_model,
        model_params,
        scope,
        previous_tokens,
        timestep,
        fake_seq_lengths,
    ):
        """Add one ensemble member's encoder and single-step decoder to the nets.

        Args:
            model: model helper whose nets receive the encoder-side ops.
            step_model: model whose net receives the per-step decoder ops.
            model_params: dict read for the keys ``'attention'`` (``'none'``
                or ``'regular'``), ``'encoder_type'``,
                ``'encoder_embedding_size'``, ``'decoder_embedding_size'``,
                ``'decoder_layer_configs'`` (each entry has ``'num_units'``)
                and ``'decoder_softmax_size'``.
            scope: name scope (and decoder name) for this member's blobs.
            previous_tokens: blob of tokens chosen at the previous step.
            timestep: blob holding the current step.
            fake_seq_lengths: sequence-lengths blob handed to the decoder.

        Returns:
            Tuple ``(state_configs, output_log_probs, attention_weights)``.
        """
        attention_type = model_params['attention']
        assert attention_type in ['none', 'regular']
        use_attention = (attention_type != 'none')

        with core.NameScope(scope):
            encoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.source_vocab_size,
                embedding_size=model_params['encoder_embedding_size'],
                name='encoder_embeddings',
                freeze_embeddings=False,
            )

        # The encoder builder receives the scope explicitly, so it is
        # called outside the NameScope context.
        encoder_results = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=model_params['encoder_type'],
            num_decoder_layers=len(model_params['decoder_layer_configs']),
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=encoder_embeddings,
            embedding_size=model_params['encoder_embedding_size'],
            use_attention=use_attention,
            num_gpus=0,
            forward_only=True,
            scope=scope,
        )
        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_states,
            final_encoder_cell_states,
            encoder_units_per_layer,
        ) = encoder_results

        with core.NameScope(scope):
            if use_attention:
                # [max_source_length, beam_size, encoder_output_dim]
                encoder_outputs = model.net.Tile(
                    encoder_outputs,
                    'encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            if weighted_encoder_outputs is not None:
                weighted_encoder_outputs = model.net.Tile(
                    weighted_encoder_outputs,
                    'weighted_encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            decoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.target_vocab_size,
                embedding_size=model_params['decoder_embedding_size'],
                name='decoder_embeddings',
                freeze_embeddings=False,
            )
            embedded_tokens_t_prev = step_model.net.Gather(
                [decoder_embeddings, previous_tokens],
                'embedded_tokens_t_prev',
            )

        # One LSTM cell per configured decoder layer.
        layer_configs = model_params['decoder_layer_configs']
        decoder_cells = []
        decoder_units_per_layer = []
        for layer_index, layer_config in enumerate(layer_configs):
            num_units = layer_config['num_units']
            decoder_units_per_layer.append(num_units)
            # The first layer consumes token embeddings; every later layer
            # consumes the output of the layer below it.
            if layer_index == 0:
                input_size = model_params['decoder_embedding_size']
            else:
                input_size = layer_configs[layer_index - 1]['num_units']
            decoder_cells.append(rnn_cell.LSTMCell(
                forward_only=True,
                input_size=input_size,
                hidden_size=num_units,
                forget_bias=0.0,
                memory_optimization=False,
            ))

        with core.NameScope(scope):
            # Tile the final encoder states beam_size times along axis 1,
            # replacing the list entries in place.
            if final_encoder_hidden_states is not None:
                for layer, hidden_state in enumerate(
                    final_encoder_hidden_states
                ):
                    if hidden_state is None:
                        continue
                    final_encoder_hidden_states[layer] = model.net.Tile(
                        hidden_state,
                        'final_encoder_hidden_tiled_{}'.format(layer),
                        tiles=self.beam_size,
                        axis=1,
                    )
            if final_encoder_cell_states is not None:
                for layer, cell_state in enumerate(final_encoder_cell_states):
                    if cell_state is None:
                        continue
                    final_encoder_cell_states[layer] = model.net.Tile(
                        cell_state,
                        'final_encoder_cell_tiled_{}'.format(layer),
                        tiles=self.beam_size,
                        axis=1,
                    )
            initial_states = seq2seq_util.build_initial_rnn_decoder_states(
                model=model,
                encoder_units_per_layer=encoder_units_per_layer,
                decoder_units_per_layer=decoder_units_per_layer,
                final_encoder_hidden_states=final_encoder_hidden_states,
                final_encoder_cell_states=final_encoder_cell_states,
                use_attention=use_attention,
            )

        attention_decoder = seq2seq_util.LSTMWithAttentionDecoder(
            encoder_outputs=encoder_outputs,
            encoder_output_dim=encoder_units_per_layer[-1],
            encoder_lengths=None,
            vocab_size=self.target_vocab_size,
            attention_type=attention_type,
            embedding_size=model_params['decoder_embedding_size'],
            decoder_num_units=decoder_units_per_layer[-1],
            decoder_cells=decoder_cells,
            weighted_encoder_outputs=weighted_encoder_outputs,
            name=scope,
        )
        prev_state_names = [
            '{}/{}_prev'.format(scope, state_name)
            for state_name in attention_decoder.get_state_names()
        ]
        states_prev = step_model.net.AddExternalInputs(*prev_state_names)
        decoder_outputs, states = attention_decoder.apply(
            model=step_model,
            input_t=embedded_tokens_t_prev,
            seq_lengths=fake_seq_lengths,
            states=states_prev,
            timestep=timestep,
        )

        # Pair every recurrent state with its initial value and with the
        # links the beam search uses to read it (offset 0) and write it
        # (offset 1).
        state_configs = []
        for initial_state, state_prev, state in zip(
            initial_states,
            states_prev,
            states,
        ):
            state_configs.append(BeamSearchForwardOnly.StateConfig(
                initial_value=initial_state,
                state_prev_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state_prev,
                    offset=0,
                    window=1,
                ),
                state_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state,
                    offset=1,
                    window=1,
                ),
            ))

        with core.NameScope(scope):
            decoder_output_dim = attention_decoder.get_output_dim()
            decoder_outputs_flattened, _ = step_model.net.Reshape(
                [decoder_outputs],
                [
                    'decoder_outputs_flattened',
                    'decoder_outputs_and_contexts_combination_old_shape',
                ],
                shape=[-1, decoder_output_dim],
            )
            output_logits = seq2seq_util.output_projection(
                model=step_model,
                decoder_outputs=decoder_outputs_flattened,
                decoder_output_size=attention_decoder.get_output_dim(),
                target_vocab_size=self.target_vocab_size,
                decoder_softmax_size=model_params['decoder_softmax_size'],
            )
            # [1, beam_size, target_vocab_size]
            output_probs = step_model.net.Softmax(
                output_logits,
                'output_probs',
            )
            output_log_probs = step_model.net.Log(
                output_probs,
                'output_log_probs',
            )
            if use_attention:
                attention_weights = attention_decoder.get_attention_weights()
            else:
                # Without attention, emit zeros derived from the encoder
                # inputs: fill, transpose, then tile beam_size times.
                zero_fill = step_model.net.ConstantFill(
                    [self.encoder_inputs],
                    'zero_attention_weights_tmp_1',
                    value=0.0,
                )
                zero_transposed = step_model.net.Transpose(
                    zero_fill,
                    'zero_attention_weights_tmp_2',
                )
                attention_weights = step_model.net.Tile(
                    zero_transposed,
                    'zero_attention_weights_tmp',
                    tiles=self.beam_size,
                    axis=0,
                )

        return (
            state_configs,
            output_log_probs,
            attention_weights,
        )

    def __init__(
        self,
        translate_params,
    ):
        """Build the ensemble beam-search net and create it in the workspace.

        Args:
            translate_params: dict with two keys.
                ``'ensemble_models'`` is a non-empty list of dicts, each
                with ``'source_vocab'``, ``'target_vocab'`` and
                ``'model_params'``; all members must share the same two
                vocabularies.
                ``'decoding_params'`` is a dict with ``'beam_size'``,
                ``'word_reward'`` and ``'unk_reward'``.
        """
        self.models = translate_params['ensemble_models']
        decoding_params = translate_params['decoding_params']
        self.beam_size = decoding_params['beam_size']

        assert len(self.models) > 0
        first_model = self.models[0]
        source_vocab = first_model['source_vocab']
        target_vocab = first_model['target_vocab']
        for model in self.models:
            assert model['source_vocab'] == source_vocab
            assert model['target_vocab'] == target_vocab

        self.source_vocab_size = len(source_vocab)
        self.target_vocab_size = len(target_vocab)

        self.decoder_scope_names = [
            'model{}'.format(index) for index, _ in enumerate(self.models)
        ]

        self.model = Seq2SeqModelHelper(init_params=True)

        net = self.model.net
        self.encoder_inputs = net.AddExternalInput('encoder_inputs')
        self.encoder_lengths = net.AddExternalInput('encoder_lengths')
        self.max_output_seq_len = net.AddExternalInput('max_output_seq_len')

        fake_seq_lengths = self.model.param_init_net.ConstantFill(
            [],
            'fake_seq_lengths',
            shape=[self.beam_size],
            value=100000,
            dtype=core.DataType.INT32,
        )

        beam_decoder = BeamSearchForwardOnly(
            beam_size=self.beam_size,
            model=self.model,
            go_token_id=seq2seq_util.GO_ID,
            eos_token_id=seq2seq_util.EOS_ID,
        )
        step_model = beam_decoder.get_step_model()

        # Build each ensemble member and collect its per-step outputs.
        state_configs = []
        output_log_probs = []
        attention_weights = []
        for model, scope_name in zip(self.models, self.decoder_scope_names):
            decoder_parts = self._build_decoder(
                model=self.model,
                step_model=step_model,
                model_params=model['model_params'],
                scope=scope_name,
                previous_tokens=beam_decoder.get_previous_tokens(),
                timestep=beam_decoder.get_timestep(),
                fake_seq_lengths=fake_seq_lengths,
            )
            (
                state_configs_per_decoder,
                output_log_probs_per_decoder,
                attention_weights_per_decoder,
            ) = decoder_parts
            state_configs.extend(state_configs_per_decoder)
            output_log_probs.append(output_log_probs_per_decoder)
            if attention_weights_per_decoder is not None:
                attention_weights.append(attention_weights_per_decoder)

        assert len(attention_weights) > 0
        # Weighting every member by 1/N turns the weighted sum into a mean.
        num_decoders_with_attention_blob = (
            self.model.param_init_net.ConstantFill(
                [],
                'num_decoders_with_attention_blob',
                value=1 / float(len(attention_weights)),
                shape=[1],
            )
        )
        # [beam_size, encoder_length, 1]
        attention_weights_average = _weighted_sum(
            model=step_model,
            values=attention_weights,
            weight=num_decoders_with_attention_blob,
            output_name='attention_weights_average',
        )

        num_decoders_blob = self.model.param_init_net.ConstantFill(
            [],
            'num_decoders_blob',
            value=1 / float(len(output_log_probs)),
            shape=[1],
        )
        # [beam_size, target_vocab_size]
        output_log_probs_average = _weighted_sum(
            model=step_model,
            values=output_log_probs,
            weight=num_decoders_blob,
            output_name='output_log_probs_average',
        )

        # Zero-filled here; the actual rewards are fed into the workspace
        # after param_init_net has run (see below).
        word_rewards = self.model.param_init_net.ConstantFill(
            [],
            'word_rewards',
            shape=[self.target_vocab_size],
            value=0.0,
            dtype=core.DataType.FLOAT,
        )

        beam_outputs = beam_decoder.apply(
            inputs=self.encoder_inputs,
            length=self.max_output_seq_len,
            log_probs=output_log_probs_average,
            attentions=attention_weights_average,
            state_configs=state_configs,
            data_dependencies=[],
            word_rewards=word_rewards,
        )
        (
            self.output_token_beam_list,
            self.output_prev_index_beam_list,
            self.output_score_beam_list,
            self.output_attention_weights_beam_list,
        ) = beam_outputs

        workspace.RunNetOnce(self.model.param_init_net)
        workspace.FeedBlob(
            'word_rewards',
            self.build_word_rewards(
                vocab_size=self.target_vocab_size,
                word_reward=decoding_params['word_reward'],
                unk_reward=decoding_params['unk_reward'],
            )
        )

        workspace.CreateNet(
            self.model.net,
            input_blobs=[
                str(self.encoder_inputs),
                str(self.encoder_lengths),
                str(self.max_output_seq_len),
            ],
        )

        logger.info('Params created: ')
        for param in self.model.params:
            logger.info(param)

    def decode(self, numberized_input, max_output_seq_len):
        """Translate one numberized sentence with beam search.

        Args:
            numberized_input: sequence of source token ids.
            max_output_seq_len: number of beam-search steps to run.

        Returns:
            Tuple ``(output, attention_weights_per_token, best_score)``:
            the token ids of the best hypothesis in order, one attention
            row per output token (in source order, cut to the source
            length), and the negated beam score of that hypothesis.
        """
        # The source sentence is fed reversed, one token per row.
        source_column = np.array([
            [token_id] for token_id in reversed(numberized_input)
        ]).astype(dtype=np.int32)
        workspace.FeedBlob(self.encoder_inputs, source_column)
        workspace.FeedBlob(
            self.encoder_lengths,
            np.array([len(numberized_input)]).astype(dtype=np.int32),
        )
        workspace.FeedBlob(
            self.max_output_seq_len,
            np.array([max_output_seq_len]).astype(dtype=np.int64),
        )

        workspace.RunNet(self.model.net)

        num_steps = max_output_seq_len
        score_beam_list = workspace.FetchBlob(self.output_score_beam_list)
        token_beam_list = workspace.FetchBlob(self.output_token_beam_list)
        prev_index_beam_list = workspace.FetchBlob(
            self.output_prev_index_beam_list
        )
        attention_weights_beam_list = workspace.FetchBlob(
            self.output_attention_weights_beam_list
        )

        # Choose the best-scoring hypothesis among those that emitted EOS
        # or reached the final step. Starts from hypothesis 0 of the last
        # step; only a strictly better score replaces the current best.
        best_step, best_hyp = num_steps, 0
        for step in range(num_steps + 1):
            for hyp in range(self.beam_size):
                is_finished = (
                    token_beam_list[step][hyp][0] == seq2seq_util.EOS_ID or
                    step == num_steps
                )
                if not is_finished:
                    continue
                best_so_far = score_beam_list[best_step][best_hyp][0]
                if score_beam_list[step][hyp][0] > best_so_far:
                    best_step, best_hyp = step, hyp

        # Walk the back-pointers from the chosen hypothesis to step 1,
        # collecting tokens and attention rows last-to-first.
        step, hyp = best_step, best_hyp
        best_score = -score_beam_list[step][hyp][0]
        tokens_backward = []
        attention_backward = []
        while step > 0:
            tokens_backward.append(token_beam_list[step][hyp][0])
            attention_backward.append(attention_weights_beam_list[step][hyp])
            hyp = prev_index_beam_list[step][hyp][0]
            step -= 1

        # The encoder inputs were fed reversed, so each attention row is
        # flipped back to source order and cut to the source length.
        source_length = len(numberized_input)
        attention_weights_per_token = [
            list(reversed(attention_row))[:source_length]
            for attention_row in reversed(attention_backward)
        ]
        output = tokens_backward[::-1]
        return output, attention_weights_per_token, best_score
def run_seq2seq_beam_decoder(args, model_params, decoding_params):
    """Translate sentences from stdin, printing one translation per line.

    Vocabularies are generated from ``args.source_corpus`` and
    ``args.target_corpus`` using ``args.unk_threshold``. A single-model
    ensemble decoder is built and its parameters are loaded from
    ``args.checkpoint``. Each input line is decoded with an output length
    limit of ``2 * source_length + 5``.

    Args:
        args: parsed command-line namespace; the attributes listed above
            are read.
        model_params: model configuration dict for the ensemble member.
        decoding_params: decoding configuration dict passed to the decoder.
    """
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus, args.unk_threshold)
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus, args.unk_threshold)
    # Reverse lookup: token id -> token string.
    inversed_target_vocab = {
        token_id: token for (token, token_id) in viewitems(target_vocab)
    }
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    ensemble_member = {
        'source_vocab': source_vocab,
        'target_vocab': target_vocab,
        'model_params': model_params,
        'model_file': args.checkpoint,
    }
    decoder = Seq2SeqModelCaffe2EnsembleDecoder(
        translate_params={
            'ensemble_models': [ensemble_member],
            'decoding_params': decoding_params,
        },
    )
    decoder.load_models()

    for line in sys.stdin:
        source_ids = seq2seq_util.get_numberized_sentence(line, source_vocab)
        max_output_len = 2 * len(source_ids) + 5
        # Only the token ids are printed; alignment and score are unused.
        translation, _alignment, _score = decoder.decode(
            source_ids, max_output_len)
        print(' '.join(inversed_target_vocab[tid] for tid in translation))
def main():
    """Parse command-line options and run the stdin-to-stdout translator.

    Builds the encoder/decoder layer configurations and the decoding
    parameters from the parsed flags, then calls
    ``run_seq2seq_beam_decoder``. Invalid flag combinations are reported
    through ``parser.error``, which exits the process.
    """
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Translation',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. Each '
                        'line in the file should contain a single sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labeled unknown token')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'in encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of softmax layer in the decoder')

    parser.add_argument('--beam-size', type=int, default=6,
                        help='Size of beam for the decoder')
    parser.add_argument('--word-reward', type=float, default=0.0,
                        help='Reward per each word generated.')
    parser.add_argument('--unk-reward', type=float, default=0.0,
                        help='Reward per each UNK token generated. '
                        'Typically should be negative.')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path to checkpoint', required=True)

    args = parser.parse_args()

    # NOTE(review): list multiplication repeats ONE dict object, so the
    # halving below changes `num_units` for every encoder layer, not just
    # layer 0. This is kept as-is because the layer sizes presumably have
    # to match the configuration the checkpoint was trained with; confirm
    # against the training script before giving each layer its own dict.
    encoder_layer_configs = [
        dict(
            num_units=args.encoder_cell_num_units,
        ),
    ] * args.encoder_num_layers

    if args.use_bidirectional_encoder:
        # An explicit check rather than `assert`, which `python -O` removes.
        if args.encoder_cell_num_units % 2 != 0:
            parser.error(
                '--encoder-cell-num-units must be even when '
                '--use-bidirectional-encoder is set'
            )
        # Floor division keeps `num_units` an int. `/=` is true division
        # on Python 3 and would produce a float layer size.
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(
            num_units=args.decoder_cell_num_units,
        ),
    ] * args.decoder_num_layers

    run_seq2seq_beam_decoder(
        args,
        model_params=dict(
            attention=('regular' if args.use_attention else 'none'),
            decoder_layer_configs=decoder_layer_configs,
            encoder_type=dict(
                encoder_layer_configs=encoder_layer_configs,
                use_bidirectional_encoder=args.use_bidirectional_encoder,
            ),
            encoder_embedding_size=args.encoder_embedding_size,
            decoder_embedding_size=args.decoder_embedding_size,
            decoder_softmax_size=args.decoder_softmax_size,
        ),
        decoding_params=dict(
            beam_size=args.beam_size,
            word_reward=args.word_reward,
            unk_reward=args.unk_reward,
        ),
    )
# Run the command-line translator only when executed as a script.
if __name__ == '__main__':
    main()
|