There may be a problem working with TF Hub #89

Open
Kiris-tingna opened this issue Jul 14, 2021 · 0 comments
Hi, I am using the script below to generate an ALBERT SavedModel that is compatible with TF Serving.

After exporting the model, I send this request:
{
"instances":[
{"inputs": ["你好么"]}
]
}
The result does not look right. I want the ALBERT output embedding vector, but the response contains what look like the tokenizer's token IDs (101 = CLS_ID, 102 = SEP_ID, then PAD_ID zeros, matching the constants in my tokenizer layer below):

{
  "predictions": [
    [101, 872, 1962, 720, 102, 0, 0, 0, ..., 0]
  ]
}

(123 trailing zeros in total; the full vector has length 128, my sequence_length)
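
For reference, this is how I call TF Serving's REST predict endpoint from Python (assuming the default REST port 8501 and that the model is served under the name albert-zh, matching my export path; both are assumptions about my local setup):

import requests

resp = requests.post(
    "http://localhost:8501/v1/models/albert-zh:predict",  # assumed serving host/port and model name
    json={"instances": [{"inputs": ["你好么"]}]},
)
print(resp.json())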

import os

import bert
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
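# Note: this makes the rest of the script run in TF1-style graph mode.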

model_name = "albert_base"
model_dir = bert.fetch_brightmart_albert_model(model_name, ".models")
model_ckpt = os.path.join(model_dir, "albert_model.ckpt")

bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
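# Note: l_bert (bert-for-tf2's BertModelLayer) expects token ids of shape
# [batch_size, seq_len] and returns embeddings of shape [batch_size, seq_len, hidden_size].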


class BertTokenizerLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_file_path, sequence_length=128, lower_case=True):
        super(BertTokenizerLayer, self).__init__()
        self.CLS_ID = tf.constant(101, dtype=tf.int64)
        self.SEP_ID = tf.constant(102, dtype=tf.int64)
        self.PAD_ID = tf.constant(0, dtype=tf.int64)
        self.sequence_length = tf.constant(sequence_length)
        vocab = self.load_vocab(vocab_file_path)
        # These two lines are essentially what makes saving work:
        # assigning the vocab table to this layer and later attaching the
        # instantiated layer to e.g. a Keras Model lets TensorFlow track the table.
        self.create_vocab_table(vocab)
        self.bert_tokenizer = text.BertTokenizer(
            vocab_lookup_table=self.vocab_table,
            token_out_type=tf.int64,
            lower_case=lower_case,
        )

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a list."""
        vocab = []
        with tf.io.gfile.GFile(vocab_file, "r") as reader:
            while True:
                token = reader.readline()
                if not token:
                    break
                token = token.strip()
                vocab.append(token)
        return vocab

    def create_vocab_table(self, vocab, num_oov=1):
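        """Builds a static lookup table from the vocab; OOV tokens hash into num_oov extra bucket(s)."""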
        vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
        self.init = tf.lookup.KeyValueTensorInitializer(
            keys=vocab, values=vocab_values, key_dtype=tf.string, value_dtype=tf.int64
        )
        self.vocab_table = tf.lookup.StaticVocabularyTable(
            self.init, num_oov, lookup_key_dtype=tf.string
        )

    @tf.function
    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """
        Perform the BERT preprocessing from text -> input token id
        """
        # Convert text into token ids
        tokens = self.bert_tokenizer.tokenize(inputs)

        # Flatten the ragged tensors
        tokens = tokens.merge_dims(1, 2)

        # Add start and end token ids to the id sequence
        start_tokens = tf.fill([tf.shape(inputs)[0], 1], self.CLS_ID)
        end_tokens = tf.fill([tf.shape(inputs)[0], 1], self.SEP_ID)
        tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)

        # Truncate to sequence length
        tokens = tokens[:, : self.sequence_length]

        # Convert ragged tensor to tensor and pad with PAD_ID
        tokens = tokens.to_tensor(default_value=self.PAD_ID)

        # Pad to sequence length
        pad = self.sequence_length - tf.shape(tokens)[1]
        tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=self.PAD_ID)

        return tf.reshape(tokens, [-1, self.sequence_length])
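
To debug, I ran the tokenizer layer on its own, eagerly (with the disable_eager_execution() call commented out), and its output matches the serving response exactly:

tokenizer_check = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
print(tokenizer_check(tf.constant(["你好么"])))
# -> [[ 101  872 1962  720  102    0 ...    0]], the same ids as in "predictions" above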

# text_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
# tokenizer = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
# input_tokens = tokenizer(text_input)
# embed_output = l_bert(input_tokens)    # output: [batch_size, max_seq_len, hidden_size]
# model = tf.keras.Model(inputs=text_input, outputs=embed_output)
# model.save("./models/albert-zh/1", signatures=tokenizer.call.get_concrete_function(tf.TensorSpec([], tf.string)))


model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    l_bert,
])
# Attach the tokenizer as an attribute so it is tracked by the SavedModel,
# and export its call() as the serving signature.
model.tokenizer = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
model.save(
    "./models/albert-zh/1",
    signatures=model.tokenizer.call.get_concrete_function(tf.TensorSpec(None, tf.string)),
)
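
Given that the "predictions" are exactly these token ids, I suspect the exported signature only runs the tokenizer and never reaches l_bert. Based on the commented-out block above, I would expect a wiring like the following sketch to return embeddings instead (my assumption, untested; it reuses BertTokenizerLayer and l_bert as defined above and assumes eager execution is enabled):

# Sketch: chain tokenizer -> ALBERT inside one Keras model, then save it,
# so the default serving signature maps raw text to embeddings.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="inputs")  # rank-1 [batch] strings
tokenizer = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
token_ids = tokenizer(text_input)            # [batch_size, 128] int64 ids
token_ids = tf.cast(token_ids, tf.int32)     # defensive cast for the embedding lookup
embeddings = l_bert(token_ids)               # [batch_size, 128, hidden_size]
embedding_model = tf.keras.Model(inputs=text_input, outputs=embeddings)
embedding_model.save("./models/albert-zh/2")  # default signature: text -> embeddings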
