-
Notifications
You must be signed in to change notification settings - Fork 0
/
baojia2.py
192 lines (152 loc) · 8.39 KB
/
baojia2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# 低压电流互感器参数标准化转换器V1.22--转换方法
import re
import pandas as pd
# Module-level placeholder for the field type of the token being examined.
# process_data does not update it; it is kept so the module still exposes
# the name.
current_field_type = None
# Unrecognised text of every processed line. process_data appends to this
# list and never clears it, so it accumulates across calls.
unrecognized_data = []


def process_data(data):
    """Parse free-form current-transformer description lines into a table.

    Each line is stripped and split on runs of hyphens/whitespace. Every
    token is tested against the field patterns in a fixed order (model,
    voltage level, bore spec, bore size, ratio, accuracy class, capacity,
    split-core flag) and stored in the first field that accepts it. Every
    field except the model keeps only its first matching token; a later
    model-looking token overwrites an earlier one. Tokens that no field
    accepts are joined with single spaces into the '无法识别' column and the
    same text is appended to the module-level ``unrecognized_data`` list.

    Args:
        data: Iterable of text lines (trailing newlines are allowed).

    Returns:
        pandas.DataFrame with one row per input line (blank lines give an
        all-empty row) and the columns '型号', '电压等级', '内孔规格',
        '内孔尺寸', '变比', '级别', '容量', '是否开口式', '无法识别'.
        '电压等级' holds a float when the voltage is valid and a message
        string otherwise; all other cells are strings.
    """
    columns = ['型号', '电压等级', '内孔规格', '内孔尺寸', '变比', '级别', '容量', '是否开口式', '无法识别']

    # All patterns are compiled once here instead of on every token.
    # The pattern strings themselves are unchanged.
    chinese_char_range = r'[\u4e00-\u9fff]'  # CJK unified ideographs
    model_re = re.compile(r'[A-Z]+(?:\d+(/\d+)?)?')  # model: starts with A-Z
    ratio_re = re.compile(r'\d+/\d')  # ratio such as 300/5
    open_style_re = re.compile(r'\(?开口(?:式)?\)?')  # "开口" / "开口式", optionally parenthesised
    letters_re = re.compile(r'[a-zA-Z]+')  # unit letters stripped from ratio/capacity
    # Capacity: meant to accept plain numbers and numbers suffixed with
    # VA/va. Kept verbatim. NOTE(review): the embedded "[[" may trigger a
    # "Possible nested set" FutureWarning from the re module, and the
    # pattern also accepts tokens such as "1.2.3" that are not valid
    # floats, which is why the conversion in the loop below is guarded.
    capacity_re = re.compile(
        r'\b(?:(?!\d+[^\d.])[0]*(\d+(?:\.\d+)?)|(?=\d+[vV]?[aA]$))(?!.*[' + chinese_char_range + r'|[^0-9.]])\b'
    )
    voltage_re = re.compile(r'^0*(\d+(\.\d+)?)[kK]?[vV]?$')  # voltage level
    inner_hole_re = re.compile(r'^(\d+)([a-zA-Z]+|[iIvVxXlLcCdDmM]+)?$')  # bore spec
    # Loose dispatch pattern for bore size; process_inner_diameter validates
    # strictly afterwards.
    inner_diameter_re = re.compile(r'(^(\d+(.\d+)?) ?([×xX*]) ?(\d+(.\d+)?))|(^(Φ|φ) ?(\d+(\.\d+)?))')
    diameter_product_re = re.compile(r'^(\d+(\.\d+)?) ?([×xX*]) ?(\d+(\.\d+)?)$')  # e.g. 40x10
    diameter_single_re = re.compile(r'^(Φ|φ) ?(\d+(\.\d+)?)$')  # e.g. Φ30
    level_re = re.compile(r'^0*(\d+(\.\d+)?) ?([sS])?级?$')  # accuracy class
    splitter_re = re.compile(r'[-\s]+')

    def process_voltage_level(voltage_str):
        """Return the voltage as a float (<= 35), or a message string."""
        match = voltage_re.match(voltage_str)
        if not match:
            return f"{voltage_str} (不符合要求)"
        voltage_value = match.group(1)
        # Shorthand: an integer written with a leading zero ("066") is read
        # as a fraction ("0.66"). It is applied only when the captured
        # number has no decimal point of its own; otherwise "00.5" would be
        # rewritten to the invalid "0..5".
        if (
            voltage_str.startswith('0')
            and len(voltage_str) > 1
            and not voltage_str.startswith('0.')
            and '.' not in voltage_value
        ):
            voltage_value = '0.' + voltage_value.lstrip('0')
        try:
            voltage_num = float(voltage_value)
        except ValueError:
            # Defensive guard; the regex should only capture valid numbers.
            return f"{voltage_str} (无法转换为有效电压值)"
        if voltage_num <= 35:
            return voltage_num
        return f"{voltage_value} (不符合要求:超过35)"

    def process_inner_specification(spec_str):
        """Return the bore spec as digits plus upper-cased letter suffix."""
        match = inner_hole_re.match(spec_str)
        if not match:
            return f"{spec_str} (不符合要求)"
        digit, suffix = match.groups()
        return f"{digit}{suffix.upper() if suffix else ''}"

    def process_inner_diameter(capacity_str):
        """Normalise a bore size to 'A×B' or 'ΦD', or return a message."""
        product = diameter_product_re.match(capacity_str)
        if product:
            return f"{product.group(1)}×{product.group(4)}"
        single = diameter_single_re.match(capacity_str)
        if single:
            return f"Φ{single.group(2)}"
        return f"{capacity_str} (不符合要求)"

    def process_level(level_str):
        """Normalise an accuracy class (<= 3), keeping a trailing 's'."""
        match = level_re.match(level_str)
        if not match:
            return f"{level_str} (不符合要求)"
        level_num = float(match.group(1))
        if level_num > 3:
            return f"{level_num} (数值超过3)"
        # The optional S/s marker is always emitted in lower case.
        return str(level_num) + ('s' if match.group(3) else '')

    rows = []
    for line in data:
        row = dict.fromkeys(columns, '')
        unrecognized_tokens = []
        # Separate flag: a valid voltage of 0.0 is falsy, so the cell value
        # cannot be used to tell whether the field is already filled.
        voltage_matched = False
        for part in splitter_re.split(line.strip()):
            if not part:
                # Blank lines and leading/trailing separators yield empty
                # tokens; they carry no information.
                continue
            if model_re.match(part):
                row['型号'] = part
            elif not voltage_matched and voltage_re.match(part):
                row['电压等级'] = process_voltage_level(part)
                voltage_matched = True
            elif not row['内孔规格'] and inner_hole_re.match(part):
                row['内孔规格'] = str(process_inner_specification(part))
            elif not row['内孔尺寸'] and inner_diameter_re.match(part):
                row['内孔尺寸'] = str(process_inner_diameter(part))
            elif not row['变比'] and ratio_re.match(part):
                row['变比'] = letters_re.sub('', part)
            elif not row['级别'] and level_re.match(part):
                row['级别'] = str(process_level(part))
            elif not row['容量'] and capacity_re.match(part):
                try:
                    row['容量'] = str(float(letters_re.sub('', part)))
                except ValueError:
                    # Matched the capacity pattern but is not a number
                    # (e.g. "1.2.3"): report it instead of crashing.
                    unrecognized_tokens.append(part)
            elif not row['是否开口式'] and open_style_re.match(part):
                row['是否开口式'] = '开口式'
            else:
                unrecognized_tokens.append(part)
        if unrecognized_tokens:
            unrecognized_text = ' '.join(unrecognized_tokens)
            unrecognized_data.append(unrecognized_text)
            row['无法识别'] = unrecognized_text
        rows.append(row)
    # Build the frame once instead of concatenating a one-row frame per line.
    return pd.DataFrame(rows, columns=columns)
def main():
    """Convert data.txt and export the result to transformed_data.xlsx.

    Reads every line of data.txt (UTF-8), passes them to process_data,
    prints the resulting table, tries to write it to an Excel workbook and
    finally prints every entry collected in unrecognized_data.
    """
    with open('data.txt', 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    table = process_data(raw_lines)
    print(table)

    # A failed export is reported on stdout but does not stop the script;
    # the unrecognised-content report below is still printed.
    try:
        table.to_excel('transformed_data.xlsx', index=False, engine='openpyxl')
        print("数据已成功导出到 transformed_data.xlsx 文件中。")
    except Exception as export_error:
        print(f"保存数据到Excel时发生错误:{export_error}")

    # Report everything that could not be assigned to a field.
    print("\n无法识别的数据内容:")
    for leftover in unrecognized_data:
        print(leftover)


if __name__ == "__main__":
    main()