diff --git a/similarity/Types.py b/similarity/Types.py index 52ae6a4..0799695 100644 --- a/similarity/Types.py +++ b/similarity/Types.py @@ -10,8 +10,6 @@ import numpy as np import pandas as pd -from constants import warning_enable - class TypeSettings: """ @@ -56,7 +54,7 @@ def is_bool(column: pd.Series) -> bool: try: lower = column.map(str.lower) return lower.nunique() == 2 - except: + except TypeError: return column.nunique() == 2 @@ -299,6 +297,8 @@ def is_str_multiple(word: str): return to_return.count(to_return[0]) == len(to_return) + + def is_date(column: pd.Series) -> bool: """ Decide if type of column is date @@ -307,26 +307,37 @@ def is_date(column: pd.Series) -> bool: :param column: series for decide :return:true for date """ - - - def is_str_date(word: str): + def is_str_date(word: str) -> bool: + element = str(word).strip() try: - with warnings.catch_warnings(action=warning_enable.get_timezone()): - parse(str(word), fuzzy_with_tokens=True) # todo add timezone + with warnings.catch_warnings(): + parse(element, fuzzy=True) # todo add timezone return True - except (ParserError, OverflowError) as e: - element = str(word).strip() + except (ParserError, OverflowError): one_or_two = r'(\d{1}|\d{2})' two_or_four = r'(\d{2}|\d{4})' months = ('(January|February|March|April|May|June|July|August|' 'September|October|November|December|Jan|Feb' '|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)') + date_pattern = r'^(T(\d{6}|\d{4})(|.\d{3})(|Z))$' + pattern = date_pattern # + '$' # 1999,4 Feb 1999,4 February - pattern = r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months + pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2}) ' + months # 11. 4. 1999 pattern = pattern + '|' + r'^' + one_or_two + r'\. ' + one_or_two + r'\. ' + two_or_four # 1999,4February 1999,4Feb - pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),(\d{1}|\d{2})' + months + pattern = pattern + '|' + r'^(\d{1}|\d{2}|\d{4}),' + one_or_two + months + # '99/12/31', '05/2/3' 00/2/3 + pattern = pattern + '|' + r'^' + two_or_four + r'/' + one_or_two + r'/' + one_or_two + # 1995W05 2024-W50 + pattern = pattern + '|' + r'^(\d{4}(W|-W)\d{2})' + # 1995W0512 2023-W03-2 + pattern = pattern + '|' + r'(\d{4}(W|-W)\d{2}(-|)' + one_or_two + ')$' + # '1995-035', '1995035', '2024340' + pattern = pattern + '|' + r'^((2|1)\d{3}-\d{3})$|(^(2|1)\d{6})$' + # epoch time 1911517200(2030 will be max for us) + pattern = pattern + '|' + r'(^\d{1,10})$' + return bool(re.match(pattern, element)) return column.apply(lambda s: is_str_date(s)).all() diff --git a/test/test_types.py b/test/test_types.py index 5dc8ae5..8ebf3b0 100644 --- a/test/test_types.py +++ b/test/test_types.py @@ -2,7 +2,6 @@ import unittest import pandas as pd -from dateutil.parser import parse from similarity.Types import is_id, is_numerical, is_bool, get_data_kind, DataKind, is_constant, is_int, is_human_gen, \ is_not_numerical, is_categorical, is_word, is_phrase, is_sentence, is_article, is_multiple, is_date, \ @@ -12,6 +11,7 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + class TestID(unittest.TestCase): def setUp(self): self.directory = os.path.join(THIS_DIR, os.pardir, 'data/') @@ -460,13 +460,14 @@ def test_get_advanced_structural_type(self): class TestDateTime(unittest.TestCase): def setUp(self): - data = { + self.data = { 'MM-DD-YYYY': ['11-04-1999', '12-31-1999', '01-03-1999'], 'M-D-YY': ['11-4-99', '12-31-99', '1-3-99'], 'MM.DD.YYYY': ['11.4.1999', '12.31.1999', '1.3.1999'], 'MM.DD.YYYY_': ['11. 4. 1999', '12. 31. 1999', '1. 3. 1999'], 'MM.DD.YY': ['11.04.99', '12.31.99', '01.03.99'], - 'MM/DD/YY': ['11/04/99', '12/31/99', '01/03/99'], + 'MM/DD/YY': ['11/04/99', '12/31/99', '01/03/99', '2/4/95'], + 'DD/MM/YY': ['11/04/99', '31/12/99', '03/01/99', '4/2/95'], 'MM/DD/YYYY': ['11/4/1999', '12/31/1999', '1/3/1999'], 'YYYY,DDMon': ['1999,4Feb', '1999,31Jan', '1999,3Nov'], 'YYYY,DDMonth': ['1999,4February', '1999,31January', '1999,3November'], @@ -478,26 +479,46 @@ def setUp(self): 'DDMon,YYYY': ['4Feb,1999', '31 Jan,1999', '3 Nov, 1999 '], 'DDMonthYYYY': ['4February1999', '31January 1999', '3 November 1999 '], 'DDMonYYYY': ['4Feb1999', '31Jan 1999', '3 Nov 1999 '], + 'YY/MM/DD': ['95/2/4', '99/12/31', '05/2/3', '00/2/3'], + 'DD-Mon-YYYY': ['04-Feb-1995', '03-APR-1999', '31-JUL-1999'], + 'DD-Month-YYYY': ['4-February-1995', '03-April-1999', '31-July-1999'], + + # ISO 8601 https://www.cl.cam.ac.uk/~mgk25/iso-time.html + 'YYYY-MM-DD': ['1995-02-04', '2000-12-31', '1999-01-03', '2024-07-29', '1997-10-01', '2024-11-30'], + 'YYYY-MM': ['1995-02', '2000-12', '1999-01', '2024-07', '1997-10', '2024-11'], + 'YYYYMMDD': ['19950204', '20001231', '19990103', '20240729', '19971001', '20241130'], + 'Week': ['1997-W01', '1997W01', '1995W05', '2023-W03', '2024-W50'], # 1997-W01 or 1997W01 (first week of the year 1997) + 'Week_day': ['1997-W01-3', '1997W013', '1995W0512', '2023-W03-2', '2024-W50-1'], # 1997-W01-3 or 1997W013 (#rd day of the first week of the year 1997) 1995-W05-12 or 1995W0512 (12th day of the fifth week of the year 1995) + 'Days': ['1995-035', '1995035', '2024340'], # 1995-035 or 1995035 (35th day of the year 1995) + 'Year': ['1995', '2000', '2024'], + + # time and timezones + 'Time_:': ['12:34', '23:59:59', '00:00:00', 'T12:34:12.123', 'T12:34:12', 'T03:24'], + 'Time': ['T123412.123', 'T123412', 'T0324'], + 'Time:zone': ['12:34:56Z', 'T144515Z', 'T12:30+02:00', 'T12:30−02:00', 'T12:30+02', 'T12:30-0200'], + + 'Time_and_date': ['1995-02-04T12:34', '2000-12-31T23:59:59', '1999-01-03T00:00:00', + '2024-07-29T12:34:12.123', '1997-10-01T123412', '2024-11-30T0324'], + 'Time_zone': ['1995-02-04T12:34:56Z', '2000-12-31T14:30Z', '1999-01-03T00:00:00Z', + '2007-04-05T12:30−02:00'], + # EPOCH time - number of seconds from 1970-01-01T00:00:00Z + 'epoch': ['649213200', '1722241808'] # 649213200 (1990/7/29) 1722241808 (2024/7/29) + } + self.not_dates = { + 'first': ['1999,4Monuary'], + 'second': ['1999,4 Mon'], + 'third': ['1995-02-0400'], + 'fourth': ['1995-0350'], + 'fifth': ['17222418000'], + 'sixth': ['1722A41808'] } - self.data = pd.DataFrame(data) def test_date(self): - self.assertTrue(is_date(self.data['MM-DD-YYYY'])) - self.assertTrue(is_date(self.data['M-D-YY'])) - self.assertTrue(is_date(self.data['MM.DD.YYYY'])) - self.assertTrue(is_date(self.data['MM.DD.YY'])) - self.assertTrue(is_date(self.data['MM/DD/YY'])) - self.assertTrue(is_date(self.data['MM/DD/YYYY'])) - self.assertTrue(is_date(self.data['MonDD,YYYY'])) - self.assertTrue(is_date(self.data['MonthDD,YYYY'])) - self.assertTrue(is_date(self.data['DDMonYYYY'])) - self.assertTrue(is_date(self.data['DDMonthYYYY'])) - self.assertTrue(is_date(self.data['DDMon,YYYY'])) - self.assertTrue(is_date(self.data['DDMonth,YYYY'])) - self.assertTrue(is_date(self.data['YYYY,DD Mon'])) - self.assertTrue(is_date(self.data['YYYY,DDMonth'])) - self.assertTrue(is_date(self.data['MM.DD.YYYY_'])) - self.assertTrue(is_date(self.data['YYYY,DDMon'])) + for i in self.data: + print(f"{i} : {self.data[i]}\n") + self.assertTrue(is_date(pd.Series(self.data[i]))) + for i in self.not_dates: + self.assertFalse(is_date(pd.Series(i))) class TestTypesComparing(unittest.TestCase): @@ -536,5 +557,6 @@ def test_structural_types(self): self.assertTrue(issubclass(ALL, NONNUMERICAL)) self.assertTrue(issubclass(ALPHANUMERIC, NONNUMERICAL)) + if __name__ == '__main__': unittest.main()