fixed issues with ?, escaped quote & added tblproperties for HQL

xnuinside · Sep 17, 2021 · e77f6c8 · e77f6c8
1 parent b2ab6b8
commit e77f6c8
Show file tree

Hide file tree

Showing 13 changed files with 285 additions and 59 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,8 +1,9 @@
-**v0.20.0**
+**v0.19.6**
 Fixes:
 
 1. Fixed issue with PARTITIONED BY multiple columns in HQL - https://github.com/xnuinside/simple-ddl-parser/issues/66
-
+2. Question mark '?' is now handled correctly in strings - https://github.com/xnuinside/simple-ddl-parser/issues/64
+3. Fixed issue with escaping symbols & added tests - https://github.com/xnuinside/simple-ddl-parser/issues/63
 
 Features:
 

diff --git a/README.md b/README.md
@@ -304,6 +304,7 @@ You also can provide a path where you want to have a dumps with schema with argu
 - COMMENT
 - LOCATION
 - FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
+- TBLPROPERTIES ('parquet.compression'='SNAPPY', etc.)
 
 ### MSSQL / MySQL/ Oracle
 
@@ -342,6 +343,7 @@ You also can provide a path where you want to have a dumps with schema with argu
 1. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2);
 2. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ...
 3. Add support for COMMENT ON statement
+4. Add support for SKEWED BY for HQL
 
 ## non-feature todo
 
@@ -356,6 +358,17 @@ For one of the work projects I needed to convert SQL ddl to Python ORM models in
 So I remembered about Parser in Fakeme and just extracted it & improved. 
 
 ## Changelog
+**v0.19.6**
+Fixes:
+
+1. Fixed issue with PARTITIONED BY multiple columns in HQL - https://github.com/xnuinside/simple-ddl-parser/issues/66
+2. Question mark '?' is now handled correctly in strings - https://github.com/xnuinside/simple-ddl-parser/issues/64
+3. Fixed issue with escaping symbols & added tests - https://github.com/xnuinside/simple-ddl-parser/issues/63
+
+Features:
+
+1. Added support for HQL statement TBLPROPERTIES - https://github.com/xnuinside/simple-ddl-parser/issues/65
+
 **v0.19.5**
 Fixes:
 

diff --git a/docs/README.rst b/docs/README.rst
@@ -336,6 +336,7 @@ HQL Dialect statements
 * COMMENT
 * LOCATION
 * FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
+* TBLPROPERTIES ('parquet.compression'='SNAPPY', etc.)
 
 MSSQL / MySQL/ Oracle
 ^^^^^^^^^^^^^^^^^^^^^
@@ -386,6 +387,7 @@ TODO in next Releases (if you don't see feature that you need - open the issue)
 #. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2);
 #. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ...
 #. Add support for COMMENT ON statement
+#. Add support for SKEWED BY for HQL
 
 non-feature todo
 ----------------
@@ -405,6 +407,19 @@ So I remembered about Parser in Fakeme and just extracted it & improved.
 Changelog
 ---------
 
+**v0.19.6**
+Fixes:
+
+
+#. Fixed issue with PARTITIONED BY multiple columns in HQL - https://github.com/xnuinside/simple-ddl-parser/issues/66
+#. Question mark '?' is now handled correctly in strings - https://github.com/xnuinside/simple-ddl-parser/issues/64
+#. Fixed issue with escaping symbols & added tests - https://github.com/xnuinside/simple-ddl-parser/issues/63
+
+Features:
+
+
+#. Added support for HQL statement TBLPROPERTIES - https://github.com/xnuinside/simple-ddl-parser/issues/65
+
 **v0.19.5**
 Fixes:
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "simple-ddl-parser"
-version = "0.19.5"
+version = "0.19.6"
 description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL, Oracle, AWS Redshift, Snowflake, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
 authors = ["Iuliia Volkova <[email protected]>"]
 license = "MIT"

diff --git a/simple_ddl_parser/ddl_parser.py b/simple_ddl_parser/ddl_parser.py
@@ -48,15 +48,14 @@ def process_body_tokens(self, t):
         return t
 
     def t_STRING(self, t):
-        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&|]*)(\')){1}"
+        r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&|?]*)(\')){1}"
         t.type = "STRING"
         return t
 
     def t_ID(self, t):
         r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*'\()!{}\[\]\"\`]+)"
         t.type = tok.symbol_tokens.get(t.value, "ID")
         skip_id_tokens = ["(", ")", ","]
-        print(self.lexer.is_table, t.value not in skip_id_tokens, self.lexer.lp_open)
         if t.type == "LP":
             self.lexer.lp_open += 1
             self.lexer.columns_def = True
@@ -109,7 +108,6 @@ def set_last_token(self, t):
             self.lexer.is_table = False
         elif t.type == "TABLE" or t.type == "INDEX":
             self.lexer.is_table = True
-        print(t.value, t.type)
         return t
 
     def t_newline(self, t):

diff --git a/simple_ddl_parser/dialects/hql.py b/simple_ddl_parser/dialects/hql.py
@@ -1,6 +1,4 @@
-from simple_ddl_parser.utils import check_spec
-
-from simple_ddl_parser.utils import remove_par
+from simple_ddl_parser.utils import check_spec, remove_par
 
 
 class HQL:
@@ -30,19 +28,41 @@ def p_expression_row_format(self, p):
 
         p[0]["row_format"] = format
 
-    def p_assigment(self, p):
-        """assigment : ID ID ID"""
-        p_list = list(p)
-        p[0] = {p[1]: self.lexer.state.get(p_list[-1])}
-
-    def p_expression_with_serdie(self, p):
+    def p_expression_with_serde(self, p):
         """expr : expr WITH SERDEPROPERTIES LP assigment RP"""
         p[0] = p[1]
         p_list = list(p)
         row_format = p[0]["row_format"]
         row_format["properties"] = p_list[-2]
         p[0]["row_format"] = row_format
 
+    def p_expression_tblproperties(self, p):
+        """expr : expr TBLPROPERTIES multi_assigments"""
+        p[0] = p[1]
+        p[0]["tblproperties"] = list(p)[-1]
+
+    def p_multi_assigments(self, p):
+        """multi_assigments : LP assigment
+        | multi_assigments RP
+        | multi_assigments COMMA assigment RP"""
+        p_list = remove_par(list(p))
+        p[0] = p_list[1]
+        p[0].update(p_list[-1])
+
+    def p_assigment(self, p):
+        """assigment : ID ID ID
+        |  STRING ID STRING
+        |  ID ID STRING
+        |  STRING ID ID
+        |  STRING ID"""
+        p_list = remove_par(list(p))
+        if "state" in self.lexer.__dict__:
+            p[0] = {p[1]: self.lexer.state.get(p_list[-1])}
+        else:
+            if "=" in p_list[-1]:
+                p_list[-1] = p_list[-1].split("=")[-1]
+            p[0] = {p_list[1]: p_list[-1]}
+
     def p_expression_comment(self, p):
         """expr : expr COMMENT STRING"""
         p[0] = p[1]
@@ -83,9 +103,8 @@ def p_expression_partitioned_by_hql(self, p):
         """expr : expr PARTITIONED BY pid_with_type """
         p[0] = p[1]
         p_list = list(p)
-        print(list(p_list))
         p[0]["partitioned_by"] = p_list[-1]
-    
+
     def p_pid_with_type(self, p):
         """pid_with_type :  LP column
         | pid_with_type COMMA column

diff --git a/simple_ddl_parser/dialects/sql.py b/simple_ddl_parser/dialects/sql.py
@@ -2,7 +2,7 @@
 from copy import deepcopy
 from typing import Dict, List
 
-from simple_ddl_parser.utils import remove_par
+from simple_ddl_parser.utils import check_spec, remove_par
 
 
 class AfterColumns:
@@ -1104,7 +1104,7 @@ def p_pkey(self, p):
     def p_comment(self, p):
         """comment : COMMENT STRING"""
         p_list = remove_par(list(p))
-        p[0] = {"comment": p_list[-1]}
+        p[0] = {"comment": check_spec(p_list[-1])}
 
     def p_tablespace(self, p):
         """tablespace : TABLESPACE ID

diff --git a/simple_ddl_parser/output/dialects.py b/simple_ddl_parser/output/dialects.py
@@ -127,7 +127,7 @@ def key_cleaning(table_data: Dict, output_mode: str) -> Dict:
     else:
         table_data = clean_up_output(table_data, hql_clean_up_list)
         # todo: need to figure out how workaround it normally
-        if "_ddl_parser_comma_only_str" == table_data["fields_terminated_by"]:
+        if "_ddl_parser_comma_only_str" == table_data.get("fields_terminated_by"):
             table_data["fields_terminated_by"] = ","
     return table_data
 

diff --git a/simple_ddl_parser/parser.py b/simple_ddl_parser/parser.py
@@ -78,6 +78,7 @@ def pre_process_data(self, data):
             .replace("\\u2019", "'")
             .replace("'\\t'", "'pars_m_t'")
             .replace("'\\n'", "'pars_m_n'")
+            .replace("\\'", "pars_m_single")
             .replace("\\t", " ")
         )
         return data

diff --git a/simple_ddl_parser/tokens.py b/simple_ddl_parser/tokens.py
@@ -76,6 +76,7 @@
     "WITH": "WITH",
     "CLUSTER": "CLUSTER",
     "SERDEPROPERTIES": "SERDEPROPERTIES",
+    "TBLPROPERTIES": "TBLPROPERTIES",
     # oracle
     "STORAGE": "STORAGE",
     "TABLESPACE": "TABLESPACE",
@@ -91,7 +92,7 @@
 
 tokens = tuple(
     set(
-        ["ID", "DOT", "STRING", "LP", "RP", "LT", "RT", "COMMAT"]
+        ["ID", "DOT", "STRING", "LP", "RP", "LT", "RT", "COMMAT", "NEWLINE"]
         + list(defenition_statements.values())
         + list(common_statements.values())
         + list(columns_defenition.values())

diff --git a/simple_ddl_parser/utils.py b/simple_ddl_parser/utils.py
@@ -13,11 +13,20 @@ def remove_par(p_list: List[str]) -> List[str]:
     "'pars_m_t'": "'\t'",
     "'pars_m_n'": "'\n'",
     "'pars_m_dq'": '"',
+    "pars_m_single": "'",
 }
 
 
 def check_spec(value: str) -> str:
-    return spec_mapper.get(value, value)
+    replace_value = spec_mapper.get(value)
+    if not replace_value:
+        for item in spec_mapper:
+            if item in value:
+                replace_value = value.replace(item, spec_mapper[item])
+                break
+        else:
+            replace_value = value
+    return replace_value
 
 
 def find_symbols_not_in_str(str_1: str, str_2: str) -> str: