diff --git a/Notebooks/1_java_unicode_char_literals.ipynb b/Notebooks/1_java_unicode_char_literals.ipynb index b6b16f3..66a662e 100644 --- a/Notebooks/1_java_unicode_char_literals.ipynb +++ b/Notebooks/1_java_unicode_char_literals.ipynb @@ -1,169 +1,239 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Unicode Character Literals" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Valid Character Literals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "java" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hqMagK5SgDZw" + }, + "source": [ + "----------\n", + "\n", + "> **How to Run This Notebook**\n", + "\n", + "You can run this notebook in Google Colab. The cell below should be run only once, and then followed by a change of runtime to \"Java (java)\". Refresh the browser before running any subsequent code. You can also run this notebook locally if you have the IJava kernel for Jupyter installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TMZxRDS3gDZx" + }, + "outputs": [], + "source": [ + "#@title Prepare Google Colab for IJava Kernel\n", + "\n", + "%%sh\n", + "# Install java kernel\n", + "wget -q https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip\n", + "unzip -q ijava-1.3.0.zip\n", + "python install.py\n", + "\n", + "# Install proxy for the java kernel\n", + "wget -qO- https://gist.github.com/SpencerPark/e2732061ad19c1afa4a33a58cb8f18a9/archive/b6cff2bf09b6832344e576ea1e4731f0fb3df10c.tar.gz | tar xvz --strip-components=1\n", + "python install_ipc_proxy_kernel.py --kernel=java --implementation=ipc_proxy_kernel.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W94RTNlNgDZx" + }, + "source": [ + "----------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fYVc5OlogDZy" + }, + "source": [ + "# Unicode Character Literals" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v3S5V37ZgDZy" + }, + "source": [ + "## Valid Character Literals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aBds6qRJgDZy" + }, + "outputs": [], + "source": [ + "char ch1 = 'a';\n", + "char ch2 = '東'; // (Not an ASCII character!)\n", + "// char ch3 = '𐐀'; // (Not a BMP character!)\n", + "// char ch4 = '\\'; // (Backslash is a syntax error!)\n", + "\n", + "System.out.printf(\"%s %s%n\", ch1, ch2);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IOiBN-KygDZy" + }, + "source": [ + "## Unicode Notation in Strings\n", + "\n", + "- `\\uHHHH` - where H is a case-insensitive hexadecimal character\n", + "- Only supports the Basic Multilingual Plane" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OSIJ46v7gDZy" + }, + "outputs": [], + "source": [ + "char ch5 = '\\u00EA'; // ‘ê’\n", + "String str1 = \"a\\u00ea\\u00f1\\u00fcc\"; // “aêñüc”\n", + "String str2 = 
\"A\\u00EA\\u00F1\\u00FCC\"; // “AêñüC”\n", + "\n", + "System.out.println(ch5);\n", + "System.out.println(str1);\n", + "System.out.println(str2);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rh0uzmJngDZy" + }, + "source": [ + "## Literal Surrogate Pairs\n", + "\n", + "- `\\uHHHH` with surrogate pairs for characters outside the BMP\n", + "- Supplementary characters are represented as **surrogate pairs**\n", + "- Number of `char`s are different than the code point count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d9xd4QXzgDZy" + }, + "outputs": [], + "source": [ + "String str3 = \"\\uD801\\uDC00\"; // ‘𐐀’\n", + "\n", + "System.out.println(str3);\n", + "System.out.println(\"length: \" + str3.length());\n", + "System.out.println(\"code points: \" + str3.codePointCount(0, str3.length()));" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sDLai-BHgDZz" + }, + "source": [ + "## Unicode Code Point Literals\n", + "\n", + "- `0xHHHHHH` specifying code point plane and code point for characters outside the BMP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TlMXRew3gDZz" + }, + "outputs": [], + "source": [ + "int cp1 = 0x010400; // 𐐀\n", + "String str4 = new StringBuffer().appendCodePoint(cp1).toString();\n", + "String str5 = new String(Character.toChars(cp1));\n", + "\n", + "System.out.println(str4);\n", + "System.out.println(str5);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tdzrsv2egDZz" + }, + "source": [ + "## Escape Sequences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gufbEXI8gDZz" + }, + "outputs": [], + "source": [ + "private static void printUnicode(char ch) {\n", + " System.out.printf(\"\\\\u%04x%n\", (int) ch);\n", + "}\n", + "\n", + "\n", + "printUnicode('\\t');\n", + "printUnicode('\\b');\n", + "printUnicode('\\n');\n", + "printUnicode('\\r');\n", + "printUnicode('\\f');\n", + 
"printUnicode('\\'');\n", + "printUnicode('\\\"');\n", + "printUnicode('\\\\');" + ] } - }, - "outputs": [], - "source": [ - "char ch1 = 'a';\n", - "char ch2 = '東'; // (Not an ASCII character!)\n", - "// char ch3 = '𐐀'; // (Not a BMP character!)\n", - "// char ch4 = '\\'; // (Backslash is a syntax error!)\n", - "\n", - "System.out.printf(\"%s %s%n\", ch1, ch2);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Unicode Notation in Strings\n", - "\n", - "- `\\uHHHH` - where H is a case-insensitive hexadecimal character\n", - "- Only supports the Basic Multilingual Plane" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "java" - } - }, - "outputs": [], - "source": [ - "char ch5 = '\\u00EA'; // ‘ê’\n", - "String str1 = \"a\\u00ea\\u00f1\\u00fcc\"; // “aêñüc”\n", - "String str2 = \"A\\u00EA\\u00F1\\u00FCC\"; // “AêñüC”\n", - "\n", - "System.out.println(ch5);\n", - "System.out.println(str1);\n", - "System.out.println(str2);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Literal Surrogate Pairs\n", - "\n", - "- `\\uHHHH` with surrogate pairs for characters outside the BMP\n", - "- Supplementary characters are represented as **surrogate pairs**\n", - "- Number of `char`s are different than the code point count" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "java" - } - }, - "outputs": [], - "source": [ - "String str3 = \"\\uD801\\uDC00\"; // ‘𐐀’\n", - "\n", - "System.out.println(str3);\n", - "System.out.println(\"length: \" + str3.length());\n", - "System.out.println(\"code points: \" + str3.codePointCount(0, str3.length()));" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Unicode Code Point Literals\n", - "\n", - "- `0xHHHHHH` specifying code point plane and code point for characters outside the BMP" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "vscode": { - "languageId": "java" - } - }, - "outputs": [], - "source": [ - "int cp1 = 0x010400; // 𐐀\n", - "String str4 = new StringBuffer().appendCodePoint(cp1).toString();\n", - "String str5 = new String(Character.toChars(cp1));\n", - "\n", - "System.out.println(str4);\n", - "System.out.println(str5);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Escape Sequences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "java" + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + "file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "java", + "pygments_lexer": "java", + "version": "17.0.6+9-LTS-190" + }, + "polyglot_notebook": { + "kernelInfo": { + "defaultKernelName": "csharp", + "items": [ + { + "aliases": [], + "name": "csharp" + } + ] + } + }, + "colab": { + "provenance": [], + "include_colab_link": true } - }, - "outputs": [], - "source": [ - "private static void printUnicode(char ch) {\n", - " System.out.printf(\"\\\\u%04x%n\", (int) ch);\n", - "}\n", - "\n", - "\n", - "printUnicode('\\t');\n", - "printUnicode('\\b');\n", - "printUnicode('\\n');\n", - "printUnicode('\\r');\n", - "printUnicode('\\f');\n", - "printUnicode('\\'');\n", - "printUnicode('\\\"');\n", - "printUnicode('\\\\');" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "Java", - "pygments_lexer": "java", - "version": "17.0.6+9-LTS-190" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Notebooks/2_java_unicode_case_conversions.ipynb b/Notebooks/2_java_unicode_case_conversions.ipynb index 
7cd187a..4d8e6b1 100644 --- a/Notebooks/2_java_unicode_case_conversions.ipynb +++ b/Notebooks/2_java_unicode_case_conversions.ipynb @@ -1,113 +1,184 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Unicode Case Conversions\n", - "\n", - "Uppercasing or lowercasing a character may result in more than one character. Also, depending on the position of a character in a word, you can get a different uppercase or lowercase character." - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "98uz_z16iWPq" + }, + "source": [ + "----------\n", + "\n", + "> **How to Run This Notebook**\n", + "\n", + "You can run this notebook in Google Colab. The cell below should be run only once, and then followed by a change of runtime to \"Java (java)\". Refresh the browser before running any subsequent code. You can also run this notebook locally if you have the IJava kernel for Jupyter installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lxcct87EiWPr" + }, + "outputs": [], + "source": [ + "#@title Prepare Google Colab for IJava Kernel\n", + "\n", + "%%sh\n", + "# Install java kernel\n", + "wget -q https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip\n", + "unzip -q ijava-1.3.0.zip\n", + "python install.py\n", + "\n", + "# Install proxy for the java kernel\n", + "wget -qO- https://gist.github.com/SpencerPark/e2732061ad19c1afa4a33a58cb8f18a9/archive/b6cff2bf09b6832344e576ea1e4731f0fb3df10c.tar.gz | tar xvz --strip-components=1\n", + "python install_ipc_proxy_kernel.py --kernel=java --implementation=ipc_proxy_kernel.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GNc5zEadiWPs" + }, + "source": [ + "----------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S01vqm4viWPs" + }, + "source": [ + "# Unicode Case Conversions\n", + "\n", + "Uppercasing or lowercasing a character may result in more than one character. Also, depending on the position of a character in a word, you can get a different uppercase or lowercase character." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qoL_A4jkiWPs" + }, + "source": [ + "## Lower Case\n", + "\n", + "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letter are both sigma." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bCJPp8QJiWPs" + }, + "outputs": [], + "source": [ + "final String greekWord = \"ΣΚΎΛΟΣ\"; // dog\n", + "final String greekUpper = greekWord.toUpperCase();\n", + "final String greekLower = greekUpper.toLowerCase();\n", + "\n", + "System.out.format(\"Greek \\\"dog\\\" - \\\"%s\\\" - length %d%n\",\n", + " greekWord,\n", + " greekWord.length());\n", + "System.out.format(\"Converted to uppercase - \\\"%s\\\"%n\",\n", + " greekUpper);\n", + "System.out.format(\"Converted to lowercase - \\\"%s\\\"%n\",\n", + " greekLower);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zzmw32CyiWPt" + }, + "source": [ + "## Upper Case\n", + "\n", + "[**U+1E9E LATIN CAPITAL LETTER SHARP S**](http://unicode.org/versions/Unicode5.1.0/#Tailored_Casing_Operations)\n", + "\n", + "In particular, capital sharp s is intended for typographical representations of signage and uppercase titles, and other environments where users require the sharp s to be preserved in uppercase. Overall, such usage is rare. In contrast, standard German orthography uses the string \"SS\" as uppercase mapping for small sharp s. Thus, with the default Unicode casing operations, capital sharp s will lowercase to small sharp s, but not the reverse: small sharp s uppercases to \"SS\". In those instances where the reverse casing operation is needed, a tailored operation would be required.\n", + "\n", + "When the German word \"straße\" is converted to uppercase \"STRASSE\", notice that the string lengths are different." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RhNr09lOiWPt" + }, + "outputs": [], + "source": [ + "final String germanWord = \"straße\"; // street\n", + "final String germanUpper = germanWord.toUpperCase();\n", + "final String germanLower = germanUpper.toLowerCase();\n", + "\n", + "System.out.format(\"German \\\"street\\\" - \\\"%s\\\" - length %d%n\",\n", + " germanWord,\n", + " germanWord.length());\n", + "System.out.format(\"Converted to uppercase - \\\"%s\\\" - length %d%n\",\n", + " germanUpper,\n", + " germanUpper.length());\n", + "System.out.format(\"Converted to lowercase - \\\"%s\\\" - length %d%n\",\n", + " germanLower,\n", + " germanLower.length());" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "97uJhUn_iWPt" + }, + "source": [ + "## Incorrect Character Conversions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yVaWHFk3iWPt" + }, + "outputs": [], + "source": [ + "final char germanChar = 'ß';\n", + "final char germanCharUpper = Character.toUpperCase(germanChar);\n", + "System.out.format(\"%s becomes %s in uppercase%n\",\n", + " germanChar,\n", + " germanCharUpper);\n", + "System.out.format(\"Same char? %b\", germanChar == germanCharUpper);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + "file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "java", + "pygments_lexer": "java", + "version": "17.0.6+9-LTS-190" + }, + "colab": { + "provenance": [], + "include_colab_link": true + } }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lower Case\n", - "\n", - "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letter are both sigma." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String greekWord = \"ΣΚΎΛΟΣ\"; // dog\n", - "final String greekUpper = greekWord.toUpperCase();\n", - "final String greekLower = greekUpper.toLowerCase();\n", - "\n", - "System.out.format(\"Greek \\\"dog\\\" - \\\"%s\\\" - length %d%n\",\n", - " greekWord,\n", - " greekWord.length());\n", - "System.out.format(\"Converted to uppercase - \\\"%s\\\"%n\",\n", - " greekUpper);\n", - "System.out.format(\"Converted to lowercase - \\\"%s\\\"%n\",\n", - " greekLower);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Upper Case\n", - "\n", - "[**U+1E9E LATIN CAPITAL LETTER SHARP S**](http://unicode.org/versions/Unicode5.1.0/#Tailored_Casing_Operations)\n", - "\n", - "In particular, capital sharp s is intended for typographical representations of signage and uppercase titles, and other environments where users require the sharp s to be preserved in uppercase. Overall, such usage is rare. In contrast, standard German orthography uses the string \"SS\" as uppercase mapping for small sharp s. Thus, with the default Unicode casing operations, capital sharp s will lowercase to small sharp s, but not the reverse: small sharp s uppercases to \"SS\". In those instances where the reverse casing operation is needed, a tailored operation would be required.\n", - "\n", - "When the German word \"straße\" is converted to uppercase \"STRASSE\", notice that the string lengths are different." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String germanWord = \"straße\"; // street\n", - "final String germanUpper = germanWord.toUpperCase();\n", - "final String germanLower = germanUpper.toLowerCase();\n", - "\n", - "System.out.format(\"German \\\"street\\\" - \\\"%s\\\" - length %d%n\",\n", - " germanWord,\n", - " germanWord.length());\n", - "System.out.format(\"Converted to uppercase - \\\"%s\\\" - length %d%n\",\n", - " germanUpper,\n", - " germanUpper.length());\n", - "System.out.format(\"Converted to lowercase - \\\"%s\\\" - length %d%n\",\n", - " germanLower,\n", - " germanLower.length());" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Incorrect Character Conversions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final char germanChar = 'ß';\n", - "final char germanCharUpper = Character.toUpperCase(germanChar);\n", - "System.out.format(\"%s becomes %s in uppercase%n\",\n", - " germanChar,\n", - " germanCharUpper);\n", - "System.out.format(\"Same char? 
%b\", germanChar == germanCharUpper);" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" - }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "java", - "pygments_lexer": "java", - "version": "17.0.6+9-LTS-190" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Notebooks/3_java_unicode_numbers.ipynb b/Notebooks/3_java_unicode_numbers.ipynb index 477f5b4..fe388d2 100644 --- a/Notebooks/3_java_unicode_numbers.ipynb +++ b/Notebooks/3_java_unicode_numbers.ipynb @@ -1,53 +1,112 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Parsing Numbers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Handle Unicode Numbers\n", - "\n", - "Demonstrates number parsing functions can handle numeric values from other (non-English) languages." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "java" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mNLXOg88iW9A" + }, + "source": [ + "----------\n", + "\n", + "> **How to Run This Notebook**\n", + "\n", + "You can run this notebook in Google Colab. The cell below should be run only once, and then followed by a change of runtime to \"Java (java)\". Refresh the browser before running any subsequent code. You can also run this notebook locally if you have the IJava kernel for Jupyter installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DialDW_qiW9B" + }, + "outputs": [], + "source": [ + "#@title Prepare Google Colab for IJava Kernel\n", + "\n", + "%%sh\n", + "# Install java kernel\n", + "wget -q https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip\n", + "unzip -q ijava-1.3.0.zip\n", + "python install.py\n", + "\n", + "# Install proxy for the java kernel\n", + "wget -qO- https://gist.github.com/SpencerPark/e2732061ad19c1afa4a33a58cb8f18a9/archive/b6cff2bf09b6832344e576ea1e4731f0fb3df10c.tar.gz | tar xvz --strip-components=1\n", + "python install_ipc_proxy_kernel.py --kernel=java --implementation=ipc_proxy_kernel.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OZwKmiHpiW9C" + }, + "source": [ + "----------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SsrPAvDfiW9C" + }, + "source": [ + "# Parsing Numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FxpsAUd5iW9C" + }, + "source": [ + "## Handle Unicode Numbers\n", + "\n", + "Demonstrates number parsing functions can handle numeric values from other (non-English) languages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5TwVJqWPiW9C" + }, + "outputs": [], + "source": [ + "final String hindiNumber = \"१२३४५६७८९०\";\n", + "final int number = Integer.parseInt(hindiNumber);\n", + "\n", + "System.out.printf(\"%s = %d%n\", hindiNumber, number);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + "file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "java", + "pygments_lexer": "java", + "version": "17.0.6+9-LTS-190" + }, + "colab": { + "provenance": [], + "include_colab_link": true } - }, - "outputs": [], - "source": [ - "final String hindiNumber = \"१२३४५६७८९०\";\n", - "final int number = Integer.parseInt(hindiNumber);\n", - "\n", - "System.out.printf(\"%s = %d%n\", hindiNumber, number);" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "Java", - "pygments_lexer": "java", - "version": "17.0.6+9-LTS-190" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Notebooks/4_java_encoding.ipynb b/Notebooks/4_java_encoding.ipynb index 0d0cde7..1d60e43 100644 --- a/Notebooks/4_java_encoding.ipynb +++ b/Notebooks/4_java_encoding.ipynb @@ -1,140 +1,219 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Encoding" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XxxqMxcaiYUE" + }, + "source": [ + "----------\n", + "\n", + "> **How to Run This Notebook**\n", + "\n", + "You can run this notebook in Google Colab. 
The cell below should be run only once, and then followed by a change of runtime to \"Java (java)\". Refresh the browser before running any subsequent code. You can also run this notebook locally if you have the IJava kernel for Jupyter installed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tPIzNJECiYUF" + }, + "outputs": [], + "source": [ + "#@title Prepare Google Colab for IJava Kernel\n", + "\n", + "%%sh\n", + "# Install java kernel\n", + "wget -q https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip\n", + "unzip -q ijava-1.3.0.zip\n", + "python install.py\n", + "\n", + "# Install proxy for the java kernel\n", + "wget -qO- https://gist.github.com/SpencerPark/e2732061ad19c1afa4a33a58cb8f18a9/archive/b6cff2bf09b6832344e576ea1e4731f0fb3df10c.tar.gz | tar xvz --strip-components=1\n", + "python install_ipc_proxy_kernel.py --kernel=java --implementation=ipc_proxy_kernel.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YYVz1VZBiYUH" + }, + "source": [ + "----------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PI4_voFmiYUH" + }, + "source": [ + "# Encoding" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OUXHdQ0MiYUH" + }, + "source": [ + "## Converting to Bytes\n", + "\n", + "Always specify encoding to avoid cross-platform surprises." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4uT_O9zMiYUH" + }, + "outputs": [], + "source": [ + "String original = \"Aß東𐐀\";\n", + "\n", + "byte[] utf8Bytes = original.getBytes(\"UTF-8\");\n", + "String roundTrip = new String(utf8Bytes, \"UTF-8\");\n", + "\n", + "System.out.println(roundTrip);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wU3pGbGciYUI" + }, + "source": [ + "> **Bad decoding**\n", + "\n", + "If an incorrect encoding is specified, no exceptions may be thrown even if data gets corrupted."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xz9s8mjziYUI" + }, + "outputs": [], + "source": [ + "original = \"Aß東𐐀\";\n", + "\n", + "utf8Bytes = original.getBytes(\"UTF-8\");\n", + "roundTrip = new String(utf8Bytes, \"UTF-16\");\n", + "\n", + "// NOTE: No encoding errors are reported!\n", + "System.out.println(roundTrip);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zw6V8_2diYUI" + }, + "source": [ + "## Writing Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dxvh6UqoiYUI" + }, + "outputs": [], + "source": [ + "final String original = \"Aß東𐐀\";\n", + "\n", + "final OutputStream fos = new FileOutputStream(\"test.txt\");\n", + "final Writer wtr = new OutputStreamWriter(fos, \"UTF-8\");\n", + "wtr.write(original);\n", + "wtr.close();" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BG8qjkGaiYUI" + }, + "source": [ + "## Reading Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "boCblFt7iYUI" + }, + "outputs": [], + "source": [ + "InputStream fis = new FileInputStream(\"test.txt\");\n", + "Reader rdr = new InputStreamReader(fis, \"UTF-8\");\n", + "BufferedReader brdr = new BufferedReader(rdr);\n", + "String text = brdr.readLine();\n", + "brdr.close();\n", + "\n", + "System.out.println(text);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xN-u1x9uiYUJ" + }, + "source": [ + "If you specify an incorrect encoding when reading a file, you can get gibberish." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gCr646zviYUJ" + }, + "outputs": [], + "source": [ + "fis = new FileInputStream(\"test.txt\");\n", + "rdr = new InputStreamReader(fis, \"UTF-16\");\n", + "brdr = new BufferedReader(rdr);\n", + "text = brdr.readLine();\n", + "brdr.close();\n", + "\n", + "System.out.println(text);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + "file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "java", + "pygments_lexer": "java", + "version": "17.0.6+9-LTS-190" + }, + "colab": { + "provenance": [], + "include_colab_link": true + } }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Converting to Bytes\n", - "\n", - "Always specify encoding to avoid cross-platform surprises." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "String original = \"Aß東𐐀\";\n", - "\n", - "byte[] utf8Bytes = original.getBytes(\"UTF-8\");\n", - "String roundTrip = new String(utf8Bytes, \"UTF-8\");\n", - "\n", - "System.out.println(roundTrip);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> **Bad decoding**\n", - "\n", - "If an incorrect encoding is speccified, no exceptions may be thrown even if data gets corrupted." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "original = \"Aß東𐐀\";\n", - "\n", - "utf8Bytes = original.getBytes(\"UTF-8\");\n", - "roundTrip = new String(utf8Bytes, \"UTF-16\");\n", - "\n", - "// NOTE: No encoding errors are reported!\n", - "System.out.println(roundTrip);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Writing Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String original = \"Aß東𐐀\";\n", - "\n", - "final OutputStream fos = new FileOutputStream(\"test.txt\");\n", - "final Writer wtr = new OutputStreamWriter(fos, \"UTF-8\");\n", - "wtr.write(original);\n", - "wtr.close();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reading Files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "InputStream fis = new FileInputStream(\"test.txt\");\n", - "Reader rdr = new InputStreamReader(fis, \"UTF-8\");\n", - "BufferedReader brdr = new BufferedReader(rdr);\n", - "String text = brdr.readLine();\n", - "brdr.close();\n", - "\n", - "System.out.println(text);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you specify an incorrect encoding when reading a file, you can get gibberish." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fis = new FileInputStream(\"test.txt\");\n", - "rdr = new InputStreamReader(fis, \"UTF-16\");\n", - "brdr = new BufferedReader(rdr);\n", - "text = brdr.readLine();\n", - "brdr.close();\n", - "\n", - "System.out.println(text);" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" - }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "java", - "pygments_lexer": "java", - "version": "17.0.6+9-LTS-190" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Notebooks/5_java_unicode_pattern_matching.ipynb b/Notebooks/5_java_unicode_pattern_matching.ipynb index 25b8b73..481faf8 100644 --- a/Notebooks/5_java_unicode_pattern_matching.ipynb +++ b/Notebooks/5_java_unicode_pattern_matching.ipynb @@ -1,163 +1,250 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Unicode Pattern Matching" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "szd1V-QPiZHY" + }, + "source": [ + "----------\n", + "\n", + "> **How to Run This Notebook**\n", + "\n", + "You can run this notebook in Google Colab. The cell below should be run only once, and then followed by a change of runtime to \"Java (java)\". Refresh the browser before running any subsequent code. You can also run this notebook locally if you have the IJava kernel for Jupyter installed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z3PkI4W8iZHa" + }, + "outputs": [], + "source": [ + "#@title Prepare Google Colab for IJava Kernel\n", + "\n", + "%%sh\n", + "# Install java kernel\n", + "wget -q https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip\n", + "unzip -q ijava-1.3.0.zip\n", + "python install.py\n", + "\n", + "# Install proxy for the java kernel\n", + "wget -qO- https://gist.github.com/SpencerPark/e2732061ad19c1afa4a33a58cb8f18a9/archive/b6cff2bf09b6832344e576ea1e4731f0fb3df10c.tar.gz | tar xvz --strip-components=1\n", + "python install_ipc_proxy_kernel.py --kernel=java --implementation=ipc_proxy_kernel.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uxNxCW1UiZHb" + }, + "source": [ + "----------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0BdFBckaiZHb" + }, + "source": [ + "# Unicode Pattern Matching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0l1HIpdKiZHb" + }, + "outputs": [], + "source": [ + "boolean matches = false;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gdAHhKXaiZHb" + }, + "source": [ + "## Case Insensitive Matching" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lG6ZDbMdiZHb" + }, + "source": [ + "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letter are both sigma." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9kT5CqB3iZHb" + }, + "outputs": [], + "source": [ + "final String lowerGreek = \"σκύλος\";\n", + "final String upperGreek = \"ΣΚΎΛΟΣ\";\n", + "final Pattern patternGreek = Pattern.compile(lowerGreek,\n", + " Pattern.CASE_INSENSITIVE\n", + " | Pattern.UNICODE_CASE);\n", + "matches = patternGreek.matcher(upperGreek).matches();\n", + "\n", + "System.out.println(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_TQENPLqiZHc" + }, + "source": [ + "When a lowercase character results in more than one uppercase character, there is no match." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LVzo4QviiZHc" + }, + "outputs": [], + "source": [ + "final String lowerGerman = \"straße\";\n", + "final String upperGerman = \"STRASSE\";\n", + "final Pattern patternGerman = Pattern.compile(lowerGerman,\n", + " Pattern.CASE_INSENSITIVE\n", + " | Pattern.UNICODE_CASE);\n", + "matches = patternGerman.matcher(upperGerman).matches();\n", + "\n", + "System.out.println(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o5d2KctbiZHc" + }, + "source": [ + "## Matching Numbers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YvQ7IH3iiZHc" + }, + "outputs": [], + "source": [ + "final String hindiNumber = \"१२३४५६७८९०\";" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WJ1QVNCAiZHc" + }, + "source": [ + "A naive match with a range of digits `[0-9]` does not work." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0TV99vaLiZHc" + }, + "outputs": [], + "source": [ + "final Pattern digit = Pattern.compile(\"[0-9]+\");\n", + "matches = digit.matcher(hindiNumber).matches();\n", + "\n", + "System.out.println(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ChAX8IAfiZHc" + }, + "source": [ + "A slightly better regular expression with a `\\d` pattern does not work either." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cGistbfBiZHd" + }, + "outputs": [], + "source": [ + "final Pattern standard_digit = Pattern.compile(\"\\\\d+\");\n", + "matches = standard_digit.matcher(hindiNumber).matches();\n", + "\n", + "System.out.println(matches);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-gWlBge_iZHd" + }, + "source": [ + "The best way to match digits is by matching against the Unicode Decimal Number Category (Nd), using a Unicode Category pattern `\\p{Nd}`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UliZ2YQDiZHd" + }, + "outputs": [], + "source": [ + "final Pattern unicode_digit = Pattern.compile(\"\\\\p{Nd}+\");\n", + "matches = unicode_digit.matcher(hindiNumber).matches();\n", + "\n", + "System.out.println(matches);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Java", + "language": "java", + "name": "java" + }, + "language_info": { + "codemirror_mode": "java", + "file_extension": ".jshell", + "mimetype": "text/x-java-source", + "name": "java", + "pygments_lexer": "java", + "version": "17.0.6+9-LTS-190" + }, + "colab": { + "provenance": [], + "include_colab_link": true + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "boolean matches = false;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Case Insensitive Matching" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In Greek, the word for dog in lowercase is \"σκύλος\". Notice that the first and last letter are both sigma." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String lowerGreek = \"σκύλος\";\n", - "final String upperGreek = \"ΣΚΎΛΟΣ\";\n", - "final Pattern patternGreek = Pattern.compile(lowerGreek,\n", - " Pattern.CASE_INSENSITIVE\n", - " | Pattern.UNICODE_CASE);\n", - "matches = patternGreek.matcher(upperGreek).matches();\n", - "\n", - "System.out.println(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When a lowercase character results in more than one uppercase character, there is no match." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String lowerGerman = \"straße\";\n", - "final String upperGerman = \"STRASSE\";\n", - "final Pattern patternGerman = Pattern.compile(lowerGerman,\n", - " Pattern.CASE_INSENSITIVE\n", - " | Pattern.UNICODE_CASE);\n", - "matches = patternGerman.matcher(upperGerman).matches();\n", - "\n", - "System.out.println(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Matching Numbers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final String hindiNumber = \"१२३४५६७८९०\";" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A naive match with a range of digits `[0-9]` does not work." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final Pattern digit = Pattern.compile(\"[0-9]+\");\n", - "matches = digit.matcher(hindiNumber).matches();\n", - "\n", - "System.out.println(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A slightly better regular expression with a `\\d` pattern does not work either." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final Pattern standard_digit = Pattern.compile(\"\\\\d+\");\n", - "matches = standard_digit.matcher(hindiNumber).matches();\n", - "\n", - "System.out.println(matches);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The best way to match digits is by matching against the Unicode Decimal Number Category (Nd), using a Unicode Category pattern `\\p{Nd}`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "final Pattern unicode_digit = Pattern.compile(\"\\\\p{Nd}+\");\n", - "matches = unicode_digit.matcher(hindiNumber).matches();\n", - "\n", - "System.out.println(matches);" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" - }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "java", - "pygments_lexer": "java", - "version": "17.0.6+9-LTS-190" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file