# baojia2.py

# 低压电流互感器参数标准化转换器V1.22--转换方法
import re
import pandas as pd

# Module-level state: the field type currently recognised, and the content
# that could not be recognised.
# `current_field_type` is not read or rebound at module level anywhere in this
# file, so it stays None.
# `unrecognized_data` is appended to by process_data() (the leftover tokens of
# a line, joined by spaces) and printed by main(); it is never cleared, so it
# accumulates across repeated process_data() calls.
current_field_type = None
unrecognized_data = []

# Output columns, in the order they appear in the resulting DataFrame.
_COLUMNS = ['型号', '电压等级', '内孔规格', '内孔尺寸', '变比', '级别', '容量', '是否开口式', '无法识别']

# Token patterns. Every pattern is applied with .match(), i.e. anchored at the
# start of the token; only the ones ending in `$` also constrain the token end.
_TOKEN_SPLIT_RE = re.compile(r'[-\s]+')                 # tokens are separated by '-' and/or whitespace
_MODEL_RE = re.compile(r'[A-Z]+(?:\d+(/\d+)?)?')        # model: starts with an ASCII upper-case letter
_RATIO_RE = re.compile(r'\d+/\d')                       # ratio such as 150/5
_OPEN_STYLE_RE = re.compile(r'\(?开口(?:式)?\)?')        # "开口" / "开口式", optionally in ASCII parentheses
_UNIT_RE = re.compile(r'[a-zA-Z]+')                     # runs of ASCII letters (unit suffixes)
_VOLTAGE_RE = re.compile(r'^0*(\d+(\.\d+)?)[kK]?[vV]?$')
_INNER_SPEC_RE = re.compile(r'^(\d+)([a-zA-Z]+|[iIvVxXlLcCdDmM]+)?$')
# Loose gate for the inner-size column (no `$`); the strict patterns below
# decide whether the token is normalised or flagged.
_INNER_SIZE_GATE_RE = re.compile(
    r'(^(\d+(\.\d+)?) ?([×xX*]) ?(\d+(\.\d+)?))|(^(Φ|φ) ?(\d+(\.\d+)?))'
)
_INNER_SIZE_PAIR_RE = re.compile(r'^(\d+(\.\d+)?) ?([×xX*]) ?(\d+(\.\d+)?)$')
_INNER_SIZE_ROUND_RE = re.compile(r'^(Φ|φ) ?(\d+(\.\d+)?)$')
_LEVEL_RE = re.compile(r'^0*(\d+(\.\d+)?) ?([sS])?级?$')
# Gate for the capacity column. It accepts the same tokens as the previous
# pattern, which was assembled by string concatenation and produced a character
# class starting with "[[" (ambiguous "nested set" syntax that makes `re` emit
# a FutureWarning). The literal '[' and ']' are now escaped explicitly.
_CAPACITY_RE = re.compile(
    r'\b(?:(?!\d+[^\d.])[0]*(\d+(?:\.\d+)?)|(?=\d+[vV]?[aA]$))'
    r'(?!.*[\[\u4e00-\u9fff]|[^0-9.]\])\b'
)


def _normalize_voltage(voltage_str):
    """Return the voltage token as a float, or an annotated string if rejected.

    A token with a leading zero that is not already written as ``0.x`` (for
    example ``066``) is read as a decimal fraction (``0.66``). Values above 35
    and tokens that do not parse are returned as the text plus a reason.
    """
    match = _VOLTAGE_RE.match(voltage_str)
    if not match:
        return f"{voltage_str} (不符合要求)"

    # group(1) holds digits and at most one decimal point; the unit is outside it.
    voltage_value = match.group(1)
    if voltage_str.startswith('0') and len(voltage_str) > 1 and not voltage_str.startswith('0.'):
        voltage_value = '0.' + voltage_value.lstrip('0')

    try:
        voltage_num = float(voltage_value)
    except ValueError:
        return f"{voltage_str} (无法转换为有效电压值)"

    if voltage_num <= 35:
        return voltage_num
    return f"{voltage_value} (不符合要求：超过35)"


def _normalize_inner_spec(spec_str):
    """Return digits plus the upper-cased letter suffix, or an annotated string."""
    match = _INNER_SPEC_RE.match(spec_str)
    if not match:
        return f"{spec_str} (不符合要求)"
    digits, suffix = match.group(1), match.group(2)
    return f"{digits}{suffix.upper() if suffix else ''}"


def _normalize_inner_size(size_str):
    """Return ``A×B`` or ``ΦD`` for a well-formed size token, else an annotated string."""
    pair = _INNER_SIZE_PAIR_RE.match(size_str)
    if pair:
        return f"{pair.group(1)}×{pair.group(4)}"
    round_hole = _INNER_SIZE_ROUND_RE.match(size_str)
    if round_hole:
        return f"Φ{round_hole.group(2)}"
    return f"{size_str} (不符合要求)"


def _normalize_level(level_str):
    """Return the level as text (``0.5`` / ``0.5s``), or an annotated string.

    The numeric part is rendered through ``float``; an ``s``/``S`` marker is
    kept as a lower-case ``s``. Values above 3 are returned with a reason.
    """
    match = _LEVEL_RE.match(level_str)
    if not match:
        return f"{level_str} (不符合要求)"

    level_num = float(match.group(1))
    if level_num > 3:
        return f"{level_num} (数值超过3)"
    return f"{level_num}s" if match.group(3) else str(level_num)


def process_data(data):
    """Parse transformer description lines into a table of normalised fields.

    Each line is split on ``-`` and whitespace. Every token is offered to the
    columns in a fixed order (model, voltage, inner-hole spec, inner-hole size,
    ratio, level, capacity, open style); the first column that is still empty
    and whose pattern matches takes the token. Tokens nothing accepts are
    joined with spaces into the last column.

    Args:
        data: iterable of text lines (for example the result of
            ``file.readlines()``).

    Returns:
        pandas.DataFrame with one row per input line and the columns listed in
        ``_COLUMNS``. All cells are strings, except that an accepted voltage
        is stored as a float.

    Side effects:
        For every line with unrecognised tokens, the joined text is appended
        to the module-level ``unrecognized_data`` list.
    """
    rows = []

    for line in data:
        # Drop the empty tokens that a blank line or a leading/trailing '-'
        # produces, so they are not reported as unrecognised content.
        tokens = [tok for tok in _TOKEN_SPLIT_RE.split(line.strip()) if tok]

        row = dict.fromkeys(_COLUMNS, '')
        # Tracked separately because an accepted voltage of 0.0 is falsy.
        voltage_matched = False
        leftovers = []

        for part in tokens:
            if _MODEL_RE.match(part):
                # No "already filled" guard: a later token that starts with an
                # upper-case letter replaces the model.
                row['型号'] = part
            elif not voltage_matched and _VOLTAGE_RE.match(part):
                row['电压等级'] = _normalize_voltage(part)
                voltage_matched = True
            elif not row['内孔规格'] and _INNER_SPEC_RE.match(part):
                # NOTE(review): digits followed by any letters match here, so
                # a token such as "10VA" lands in this column while it is
                # still empty - confirm against real input data.
                row['内孔规格'] = _normalize_inner_spec(part)
            elif not row['内孔尺寸'] and _INNER_SIZE_GATE_RE.match(part):
                row['内孔尺寸'] = _normalize_inner_size(part)
            elif not row['变比'] and _RATIO_RE.match(part):
                row['变比'] = _UNIT_RE.sub('', part)
            elif not row['级别'] and _LEVEL_RE.match(part):
                row['级别'] = _normalize_level(part)
            elif not row['容量'] and _CAPACITY_RE.match(part):
                # The gate is loose (e.g. "5.5.5" passes), so the remaining
                # text may not be a number; treat that as unrecognised rather
                # than aborting the whole run.
                try:
                    row['容量'] = str(float(_UNIT_RE.sub('', part)))
                except ValueError:
                    leftovers.append(part)
            elif not row['是否开口式'] and _OPEN_STYLE_RE.match(part):
                row['是否开口式'] = '开口式'
            else:
                leftovers.append(part)

        if leftovers:
            leftover_text = ' '.join(leftovers)
            unrecognized_data.append(leftover_text)
            row['无法识别'] = leftover_text

        rows.append(row)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=_COLUMNS)

def main():
    """Convert ``data.txt`` and export the result to ``transformed_data.xlsx``.

    Reads every line of ``data.txt`` (UTF-8), passes them to process_data(),
    prints the resulting table, and writes it to an Excel file with openpyxl.
    An export failure is reported on stdout instead of being raised. Finally
    prints every entry collected in the module-level ``unrecognized_data``.
    """
    with open('data.txt', 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    table = process_data(raw_lines)
    print(table)

    # Top-level boundary: report any export problem and carry on to the summary.
    try:
        table.to_excel('transformed_data.xlsx', index=False, engine='openpyxl')
        print("数据已成功导出到 transformed_data.xlsx 文件中。")
    except Exception as e:
        print(f"保存数据到Excel时发生错误：{e}")

    # List everything that could not be assigned to a column.
    print("\n无法识别的数据内容：")
    for leftover in unrecognized_data:
        print(leftover)

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()