
Commit

Merge branch 'main' into merge
hans00 authored May 13, 2024
2 parents 22cbfe9 + bd31552 commit c986a8c
Showing 15 changed files with 174 additions and 46 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -318,6 +318,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
+1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1 change: 1 addition & 0 deletions docs/snippets/6_supported-models.snippet
@@ -53,6 +53,7 @@
1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
+1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
2 changes: 1 addition & 1 deletion docs/source/tutorials/vanilla-js.md
@@ -38,7 +38,7 @@ We’re also adding an empty `<div>` container for displaying the image, plus an

</details>

-Next, add the following CSS rules in a `style.css` file and and link it to the HTML:
+Next, add the following CSS rules in a `style.css` file and link it to the HTML:

```css
html,
16 changes: 8 additions & 8 deletions examples/tokenizer-playground/package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion examples/tokenizer-playground/package.json
@@ -10,7 +10,7 @@
"preview": "vite preview"
},
"dependencies": {
"@xenova/transformers": "^2.15.1",
"@xenova/transformers": "^2.17.1",
"react": "^18.2.0",
"react-dom": "^18.2.0"
},
62 changes: 48 additions & 14 deletions examples/tokenizer-playground/src/App.jsx
@@ -1,7 +1,24 @@
import { useCallback, useEffect, useRef, useState } from 'react'
-import './App.css'
+import { Token } from './components/Token'
+import './App.css'

+// Define list of tokenizers and their corresponding human-readable names
+const TOKENIZER_OPTIONS = Object.freeze({
+    'Xenova/gpt-4': 'gpt-4 / gpt-3.5-turbo / text-embedding-ada-002',
+    'Xenova/text-davinci-003': 'text-davinci-003 / text-davinci-002',
+    'Xenova/gpt-3': 'gpt-3',
+    'Xenova/grok-1-tokenizer': 'Grok-1',
+    'Xenova/claude-tokenizer': 'Claude',
+    'Xenova/mistral-tokenizer-v3': 'Mistral v3',
+    'Xenova/mistral-tokenizer-v1': 'Mistral v1',
+    'Xenova/gemma-tokenizer': 'Gemma',
+    'Xenova/llama-3-tokenizer': 'Llama 3',
+    'Xenova/llama-tokenizer': 'LLaMA / Llama 2',
+    'Xenova/c4ai-command-r-v01-tokenizer': 'Cohere Command-R',
+    'Xenova/t5-small': 'T5',
+    'Xenova/bert-base-cased': 'bert-base-cased',
+    '': 'Custom',
+})

function App() {
// Allow user to set tokenizer and text via URL query parameters
@@ -14,6 +31,7 @@ function App() {
const [margins, setMargins] = useState([])
const [outputOption, setOutputOption] = useState('text');
const [tokenizer, setTokenizer] = useState(tokenizerParam ?? 'Xenova/gpt-4');
+const [customTokenizer, setCustomTokenizer] = useState('');

const textareaRef = useRef(null);
const outputRef = useRef(null);
@@ -44,6 +62,13 @@ function App() {
return () => worker.current.removeEventListener('message', onMessageReceived);
}, []);

+const resetOutput = useCallback(() => {
+    setOutputOption('text');
+    setTokenIds([]);
+    setDecodedTokens([]);
+    setMargins([]);
+}, []);

const onInputChange = useCallback((e) => {
const model_id = tokenizer;
const text = e.target.value;
@@ -64,8 +89,10 @@
const onTokenizerChange = useCallback((e) => {
const model_id = e.target.value;
setTokenizer(model_id);
+if (!model_id) return;
worker.current.postMessage({ model_id, text: textareaRef.current.value });
}, []);

return (
<div className='w-full max-w-[720px] flex flex-col gap-4 items-center'>

@@ -74,21 +101,28 @@
<h2 className='text-lg font-normal'>Experiment with different tokenizers (running <a className="text-gray-900 underline" href="https://github.com/xenova/transformers.js">locally</a> in your browser).</h2>
</div>


<div>
-<select value={tokenizer} onChange={onTokenizerChange} className="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2">
-    <option value="Xenova/gpt-4">gpt-4 / gpt-3.5-turbo / text-embedding-ada-002</option>
-    <option value="Xenova/text-davinci-003">text-davinci-003 / text-davinci-002</option>
-    <option value="Xenova/gpt-3">gpt-3</option>
-    <option value="Xenova/grok-1-tokenizer">Grok-1</option>
-    <option value="Xenova/claude-tokenizer">Claude</option>
-    <option value="Xenova/mistral-tokenizer">Mistral</option>
-    <option value="Xenova/gemma-tokenizer">Gemma</option>
-    <option value="Xenova/llama-tokenizer">LLaMA / Llama 2</option>
-    <option value="Xenova/c4ai-command-r-v01-tokenizer">Cohere Command-R</option>
-    <option value="Xenova/t5-small">T5</option>
-    <option value="Xenova/bert-base-cased">bert-base-cased</option>
+<select value={(tokenizer in TOKENIZER_OPTIONS && !customTokenizer) ? tokenizer : ''} onChange={(e) => {
+    resetOutput();
+    setCustomTokenizer('');
+    onTokenizerChange(e);
+}} className="bg-gray-50 border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2">
+    {Object.entries(TOKENIZER_OPTIONS).map(([value, label]) => (
+        <option key={value} value={value}>{label}</option>
+    ))}
 </select>
+{(!(tokenizer in TOKENIZER_OPTIONS) || customTokenizer || tokenizer === '') && (
+    <input
+        type="text"
+        placeholder="Custom tokenizer"
+        defaultValue={customTokenizer || tokenizer}
+        onChange={(e) => {
+            setCustomTokenizer(e.target.value);
+            onTokenizerChange(e);
+        }}
+        className="bg-white border border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full py-1 px-2 mt-1"
+    />
+)}
</div>


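Both the dropdown and the custom-tokenizer input funnel into `onTokenizerChange`, which posts `{ model_id, text }` to a web worker. The worker itself is not touched by this commit; the following is a rough sketch of what the counterpart might look like, assuming it uses `AutoTokenizer` from `@xenova/transformers` and the message shape above (all names illustrative):

```js
// worker.js (hypothetical sketch, not part of this commit)
import { AutoTokenizer } from '@xenova/transformers';

// Cache loaded tokenizers so switching between models does not re-fetch files.
const tokenizers = new Map();

self.addEventListener('message', async (event) => {
    const { model_id, text } = event.data;

    // Load (or reuse) the requested tokenizer, e.g. 'Xenova/gpt-4'.
    if (!tokenizers.has(model_id)) {
        tokenizers.set(model_id, await AutoTokenizer.from_pretrained(model_id));
    }
    const tokenizer = tokenizers.get(model_id);

    // Encode the text, then decode each id separately so the UI can render
    // one <Token> component per token.
    const token_ids = tokenizer.encode(text);
    const decoded = token_ids.map((id) => tokenizer.decode([id]));

    self.postMessage({ model_id, token_ids, decoded });
});
```

The early return added in `onTokenizerChange` (`if (!model_id) return;`) keeps the 'Custom' option from posting an empty model id to a worker like this before the user has typed one.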
11 changes: 11 additions & 0 deletions scripts/supported_models.py
@@ -625,6 +625,17 @@
# 'apple/deeplabv3-mobilevit-xx-small',
# ],
},
+'mobilevitv2': {
+    # Image classification
+    'image-classification': [
+        'apple/mobilevitv2-1.0-imagenet1k-256',
+    ],
+
+    # TODO: Image segmentation
+    # 'image-segmentation': [
+    #     'apple/mobilevitv2-1.0-voc-deeplabv3',
+    # ],
+},
'mpt': {
# Text generation
'text-generation': [
17 changes: 17 additions & 0 deletions src/models.js
@@ -3829,6 +3829,21 @@ export class MobileViTForImageClassification extends MobileViTPreTrainedModel {

//////////////////////////////////////////////////

+//////////////////////////////////////////////////
+export class MobileViTV2PreTrainedModel extends PreTrainedModel { }
+export class MobileViTV2Model extends MobileViTV2PreTrainedModel { }
+export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+// TODO: MobileViTV2ForSemanticSegmentation
+
+//////////////////////////////////////////////////

//////////////////////////////////////////////////
export class OwlViTPreTrainedModel extends PreTrainedModel { }
export class OwlViTModel extends OwlViTPreTrainedModel { }
@@ -5544,6 +5559,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
['table-transformer', ['TableTransformerModel', TableTransformerModel]],
['vit', ['ViTModel', ViTModel]],
['mobilevit', ['MobileViTModel', MobileViTModel]],
+['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
['owlvit', ['OwlViTModel', OwlViTModel]],
['owlv2', ['Owlv2Model', Owlv2Model]],
['beit', ['BeitModel', BeitModel]],
@@ -5727,6 +5743,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
['vit', ['ViTForImageClassification', ViTForImageClassification]],
['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
+['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
['beit', ['BeitForImageClassification', BeitForImageClassification]],
['deit', ['DeiTForImageClassification', DeiTForImageClassification]],
['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
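With the encoder and image-classification mappings registered, a converted MobileViTV2 checkpoint should work through the standard `pipeline` API. A minimal sketch; the `Xenova/mobilevitv2-1.0-imagenet1k-256` model id and the image URL are illustrative assumptions, not part of this commit:

```js
import { pipeline } from '@xenova/transformers';

// Hypothetical converted counterpart of 'apple/mobilevitv2-1.0-imagenet1k-256'
// (the checkpoint listed in scripts/supported_models.py).
const classifier = await pipeline('image-classification', 'Xenova/mobilevitv2-1.0-imagenet1k-256');

const output = await classifier('https://example.com/cat.jpg');
console.log(output);
// e.g. [{ label: 'tabby, tabby cat', score: 0.8 }, ...]
```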
19 changes: 19 additions & 0 deletions src/processors.js
@@ -246,6 +248,8 @@ export class ImageFeatureExtractor extends FeatureExtractor {
* @param {boolean} config.do_resize Whether to resize the image.
* @param {number} config.resample What method to use for resampling.
* @param {number|Object} config.size The size to resize the image to.
+ * @param {boolean} [config.do_flip_channel_order=false] Whether to flip the color channels from RGB to BGR.
+ * Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
*/
constructor(config) {
super(config);
@@ -276,6 +278,8 @@ export class ImageFeatureExtractor extends FeatureExtractor {
// We infer the pad size from the resize size
this.pad_size = this.size
}

+this.do_flip_channel_order = this.config.do_flip_channel_order ?? false;
}

/**
@@ -571,6 +575,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
do_pad = null,
do_convert_rgb = null,
do_convert_grayscale = null,
+do_flip_channel_order = null,
} = {}) {
if (this.do_crop_margin) {
// NOTE: Specific to nougat processors. This is done before resizing,
@@ -661,6 +666,18 @@ export class ImageFeatureExtractor extends FeatureExtractor {
}
}

+if (do_flip_channel_order ?? this.do_flip_channel_order) {
+    if (imgDims[2] !== 3) {
+        throw new Error('Flipping channel order is only supported for RGB images.');
+    }
+    // Convert RGB to BGR
+    for (let i = 0; i < pixelData.length; i += 3) {
+        const temp = pixelData[i];
+        pixelData[i] = pixelData[i + 2];
+        pixelData[i + 2] = temp;
+    }
+}

const pixel_values = new Tensor('float32', pixelData, imgDims)
.permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)

@@ -830,6 +847,7 @@ export class EfficientNetImageProcessor extends ImageFeatureExtractor {


export class MobileViTFeatureExtractor extends ImageFeatureExtractor { }
+export class MobileViTImageProcessor extends MobileViTFeatureExtractor { } // NOTE extends MobileViTFeatureExtractor
export class OwlViTFeatureExtractor extends ImageFeatureExtractor {
/** @type {post_process_object_detection} */
post_process_object_detection(...args) {
@@ -2132,6 +2150,7 @@ export class AutoProcessor {
WhisperFeatureExtractor,
ViTFeatureExtractor,
MobileViTFeatureExtractor,
+MobileViTImageProcessor,
OwlViTFeatureExtractor,
Owlv2ImageProcessor,
CLIPFeatureExtractor,
2 changes: 1 addition & 1 deletion src/tokenizers.js
@@ -2990,7 +2990,7 @@ export class PreTrainedTokenizer extends Callable {
* ```javascript
* import { AutoTokenizer } from "@xenova/transformers";
*
- * const tokenizer = await AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1");
+ * const tokenizer = await AutoTokenizer.from_pretrained("Xenova/mistral-tokenizer-v1");
*
* const chat = [
* { "role": "user", "content": "Hello, how are you?" },
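The JSDoc example is truncated here; in the source it presumably continues by rendering the conversation with the tokenizer's chat template. A hedged reconstruction using the updated model id (the exact output string is illustrative):

```js
import { AutoTokenizer } from '@xenova/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mistral-tokenizer-v1');

const chat = [
    { role: 'user', content: 'Hello, how are you?' },
    { role: 'assistant', content: "I'm doing great. How can I help you today?" },
];

// Render the chat into a single prompt string without tokenizing it.
const text = tokenizer.apply_chat_template(chat, { tokenize: false });
// e.g. "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>"
```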
50 changes: 31 additions & 19 deletions src/utils/hub.js
@@ -283,23 +283,34 @@ export const fetchBinary = IS_REACT_NATIVE ? fetchBinaryImpl : fetch;
/**
* Determines whether the given string is a valid URL.
* @param {string|URL} string The string to test for validity as an URL.
+ * @param {string[]} [protocols=null] A list of valid protocols. If specified, the protocol must be in this list.
* @param {string[]} [validHosts=null] A list of valid hostnames. If specified, the URL's hostname must be in this list.
* @returns {boolean} True if the string is a valid URL, false otherwise.
*/
-function isValidHttpUrl(string, validHosts = null) {
-    // https://stackoverflow.com/a/43467144
-    let url;
-    try {
-        url = new URL(string);
-    } catch (_) {
-        return false;
-    }
-    if (validHosts && !validHosts.includes(url.hostname)) {
-        return false;
-    }
-    return IS_REACT_NATIVE
-        ? /^https?:/.test(string)
-        : url.protocol === "http:" || url.protocol === "https:";
+function isValidUrl(string, protocols = null, validHosts = null) {
+    if (IS_REACT_NATIVE) {
+        if (protocols && !protocols.some((protocol) => string.startsWith(protocol)))
+            return false;
+        if (validHosts) {
+            const match = string.match(/^(\w+\:)\/\/(([^:\/?#]*)(?:\:([0-9]+))?)/);
+            if (!match || !validHosts.includes(match[3]))
+                return false;
+        }
+    } else {
+        let url;
+        try {
+            url = new URL(string);
+        } catch (_) {
+            return false;
+        }
+        if (protocols && !protocols.includes(url.protocol)) {
+            return false;
+        }
+        if (validHosts && !validHosts.includes(url.hostname)) {
+            return false;
+        }
+    }
+    return true;
 }

/**
@@ -308,7 +319,7 @@ function isValidHttpUrl(string, validHosts = null) {
* @param {URL|string} fromUrl The URL/path of the file to download.
* @param {string} toFile The path of the file to download to.
* @param {function} progress_callback A callback function that is called with progress information.
- * @returns {Promise}
+ * @returns {Promise<void>}
*/
export async function downloadFile(fromUrl, toFile, progress_callback) {
if (IS_REACT_NATIVE) {
@@ -361,8 +372,9 @@ export async function downloadFile(fromUrl, toFile, progress_callback) {
*/
export async function getFile(urlOrPath) {

-    if (env.useFS && !isValidHttpUrl(urlOrPath)) {
-        return await FileResponse.create(urlOrPath);
+    if (env.useFS && !isValidUrl(urlOrPath, ['http:', 'https:', 'blob:'])) {
+        return new FileResponse(urlOrPath);

} else if (typeof process !== 'undefined' && process?.release?.name === 'node') {
const IS_CI = !!process.env?.TESTING_REMOTELY;
@@ -372,7 +384,7 @@
headers.set('User-Agent', `transformers.js/${version}; is_ci/${IS_CI};`);

// Check whether we are making a request to the Hugging Face Hub.
-const isHFURL = isValidHttpUrl(urlOrPath, ['huggingface.co', 'hf.co']);
+const isHFURL = isValidUrl(urlOrPath, ['http:', 'https:'], ['huggingface.co', 'hf.co']);
if (isHFURL) {
// If an access token is present in the environment variables,
// we add it to the request headers.
@@ -620,7 +632,7 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
if (env.allowLocalModels) {
// Accessing local models is enabled, so we try to get the file locally.
// If request is a valid HTTP URL, we skip the local file check. Otherwise, we try to get the file locally.
-const isURL = isValidHttpUrl(requestURL);
+const isURL = isValidUrl(requestURL, ['http:', 'https:']);
if (!isURL) {
try {
response = await getFile(localPath);
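Taken together, these hub changes make URL validation protocol-aware: browser object URLs (`blob:`) can now bypass the filesystem path, while Hugging Face Hub requests remain pinned to `huggingface.co` / `hf.co`. Illustrative calls (not part of the diff) showing how `getFile` and `getModelFile` use the helper:

```js
// true: protocol and hostname are both on the allow-lists
isValidUrl('https://huggingface.co/Xenova/gpt-4/resolve/main/tokenizer.json',
    ['http:', 'https:'], ['huggingface.co', 'hf.co']);

// true: blob: URLs pass once explicitly listed, so object URLs skip FileResponse
isValidUrl('blob:https://example.com/123e4567', ['http:', 'https:', 'blob:']);

// false: not parseable as a URL, so callers fall back to local file handling
isValidUrl('./models/bert-base-cased/config.json', ['http:', 'https:']);
```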
