# cakechat config: model, data, word2vec, NN-architecture, training,
# prediction and logging parameters.
import os
|
|
|
|
from cakechat.utils.data_structures import create_namedtuple_instance
|
|
from cakechat.utils.env import is_dev_env
|
|
|
|
MODEL_NAME = 'cakechat_v2.0_keras_tf'  # Model identifier

INTX = 'uint16'  # Use unsigned 16-bit int representation for memory efficiency
RANDOM_SEED = 42  # Fix the random seed to a certain value to make everything reproducible
|
|
|
|
# AWS S3 params
S3_MODELS_BUCKET_NAME = 'cake-chat-data-v2'  # S3 bucket with all the data
S3_NN_MODEL_REMOTE_DIR = 'nn_models'  # S3 remote directory with the models themselves
S3_TOKENS_IDX_REMOTE_DIR = 'tokens_index'  # S3 remote directory with the tokens index
S3_CONDITIONS_IDX_REMOTE_DIR = 'conditions_index'  # S3 remote directory with the conditions index
S3_W2V_REMOTE_DIR = 'w2v_models'  # S3 remote directory with pre-trained w2v models
|
|
|
|
# train datasets

# Repository root: one level above the directory containing this config module.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))

DATA_PATH = os.path.join(_PROJECT_ROOT, 'data')  # Root directory for datasets and indices
PROCESSED_CORPUS_DIR = os.path.join(DATA_PATH, 'corpora_processed')  # Preprocessed dialog corpora directory
TOKEN_INDEX_DIR = os.path.join(DATA_PATH, 'tokens_index')  # Path to prepared tokens index directory
CONDITION_IDS_INDEX_DIR = os.path.join(DATA_PATH, 'conditions_index')  # Path to prepared conditions index directory
|
|
|
|
# train & val data params
BASE_CORPUS_NAME = 'processed_dialogs'  # Basic corpus name prefix

# Dataset filename prefixes, all derived from the base corpus name.
TRAIN_CORPUS_NAME = 'train_{}'.format(BASE_CORPUS_NAME)  # Training dataset
CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_{}'.format(BASE_CORPUS_NAME)  # Validation dataset for intermediate metrics
CONTEXT_SENSITIVE_TEST_CORPUS_NAME = 'test_{}'.format(BASE_CORPUS_NAME)  # Testing dataset for final metrics calculation

MAX_VAL_LINES_NUM = 10000  # Max lines number from validation set to be used for metrics calculation
|
|
|
|
# test datasets
TEST_DATA_DIR = os.path.join(DATA_PATH, 'quality')  # Directory with evaluation datasets
CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation corpus name
TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test corpus name
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions-only corpus name
|
|
|
|
# directory to store model weights and calculated metrics
RESULTS_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'results')  # Directory to store training results
TENSORBOARD_LOG_DIR = os.path.join(RESULTS_PATH, 'tensorboard')  # Path to tensorboard logs directory
|
|
|
|
# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_PATH, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
TOKEN_REPRESENTATION_SIZE = 256  # NOTE(review): undocumented here — presumably an internal token-vector size; confirm at call sites
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation
VOCABULARY_MAX_SIZE = 50000  # Maximum vocabulary size in tokens
MAX_CONDITIONS_NUM = 5  # Maximum number of conditions
|
|
|
|
# condition inputs. We use five major emotions to condition our model's predictions
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', sadness='sadness')

DEFAULT_CONDITION = EMOTIONS_TYPES.neutral  # Default condition to be used during the prediction (if not specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Conditions embedding layer dimension to be trained
|
|
|
|
# NN architecture params
HIDDEN_LAYER_DIMENSION = 768  # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output

# True by default on GPU-enabled machines (provides ~25% inference speed-up) and False on
# CPU-only machines, since they do not support CuDNN.
# NOTE(review): any non-empty CUDA_VISIBLE_DEVICES value makes this True — even '-1',
# which actually hides all GPUs; confirm this edge case is acceptable.
USE_CUDNN = bool(os.environ.get('CUDA_VISIBLE_DEVICES'))
|
|
|
|
# training params
EPOCHS_NUM = 2  # Total number of training epochs

# Number of samples used for gradient estimation on each train step. When training on
# multiple GPUs, each worker gets this number of samples on each step.
BATCH_SIZE = 196

SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch
|
|
|
|
# Sequence-length params.
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length for the model during the training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
# Output sequence length. Derived as INPUT_SEQUENCE_LENGTH + 2 (room for start/end tokens)
# so the documented invariant cannot silently drift if INPUT_SEQUENCE_LENGTH changes;
# the value is unchanged (32).
OUTPUT_SEQUENCE_LENGTH = INPUT_SEQUENCE_LENGTH + 2
|
|
|
|
GRAD_CLIP = 5.0  # Gradient clipping param passed to the optimizer
LEARNING_RATE = 6.0  # Learning rate for the Adadelta optimizer
LOG_RUN_METADATA = False  # Set 'True' to profile memory consumption and computation time on tensorboard
AUTOENCODER_MODE = False  # Set 'True' to switch seq2seq (x -> y) into autoencoder (x -> x). Used for debugging
|
|
|
|
# predictions params
MAX_PREDICTIONS_LENGTH = 40  # Max. number of tokens which can be generated on the prediction step

# All supported decoding strategies for response generation.
PREDICTION_MODES = create_namedtuple_instance(
    'PREDICTION_MODES',
    beamsearch='beamsearch',
    beamsearch_reranking='beamsearch_reranking',
    sampling='sampling',
    sampling_reranking='sampling_reranking')

PREDICTION_MODE_FOR_TESTS = PREDICTION_MODES.sampling  # Default prediction mode used in metrics computation
PREDICTION_DISTINCTNESS_NUM_TOKENS = 50000  # Number of tokens which should be generated to compute distinctness metric
|
|
|
|
# Prediction probabilities modifiers
REPETITION_PENALIZE_COEFFICIENT = 10.0  # Divide the probabilities of tokens that have already been used during decoding

# Tokens excluded from the repetition-penalization modifier.
NON_PENALIZABLE_TOKENS = ['a', 'an', 'the', '*', '.', ',', '?', '!', '\'', '"', '^', '`']
|
|
|
|
# Options for sampling and sampling-reranking modes
DEFAULT_TEMPERATURE = 0.5  # Default softmax temperature used for sampling

# Options for beamsearch and sampling-reranking:
BEAM_SIZE = 10  # Size of the beam (beamsearch only)
SAMPLES_NUM_FOR_RERANKING = 10  # Number of samples used in reranking (sampling-reranking only)
# Weight for the MMI-reranking reverse-model score, see the paper:
# 0.0 - scoring is performed using completely the default model, 1.0 - using completely the reverse model
MMI_REVERSE_MODEL_SCORE_WEIGHT = 1.0
|
|
|
|
# Logging params
LOG_CANDIDATES_NUM = 3  # Number of candidates to be printed to output during the logging
SCREEN_LOG_NUM_TEST_LINES = 10  # Number of first test lines to use when logging outputs on screen
EVAL_STATE_PER_BATCHES = 500  # How many batches to train before the next metrics computation for TensorBoard
|
|
|
|
# Use reduced param values for development. These rebindings take effect at import
# time only when the dev environment is detected; they shadow the production values above.
if is_dev_env():
    # train & val data params
    MAX_VAL_LINES_NUM = 10

    # word embeddings params
    USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True
    TRAIN_WORD_EMBEDDINGS_LAYER = True
    WORD_EMBEDDING_DIMENSION = 64
    VOCABULARY_MAX_SIZE = 1000
    MAX_CONDITIONS_NUM = 5

    # condition inputs
    CONDITION_EMBEDDING_DIMENSION = 1

    # NN architecture params
    HIDDEN_LAYER_DIMENSION = 128
    DENSE_DROPOUT_RATIO = 0.2
    USE_CUDNN = False  # CuDNN disabled in dev mode regardless of CUDA_VISIBLE_DEVICES

    # training params
    INPUT_SEQUENCE_LENGTH = 3
    INPUT_CONTEXT_SIZE = 1
    OUTPUT_SEQUENCE_LENGTH = 5  # keeps the INPUT_SEQUENCE_LENGTH + 2 relation for start/end tokens
    BATCH_SIZE = 4
    SHUFFLE_TRAINING_BATCHES = False
    EPOCHS_NUM = 4
    LEARNING_RATE = 1.0
    LOG_RUN_METADATA = False
    AUTOENCODER_MODE = False

    # predictions params
    MAX_PREDICTIONS_LENGTH = 4

    # options for beamsearch and sampling-reranking:
    SAMPLES_NUM_FOR_RERANKING = 5
    BEAM_SIZE = 5

    # logging params
    LOG_CANDIDATES_NUM = 3
    SCREEN_LOG_NUM_TEST_LINES = 4
    EVAL_STATE_PER_BATCHES = 5
|