Commit dde629b1 by 郑博

Initial commit

# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Remote Python 3.6.3 (sftp://root@192.168.1.99:22/root/hadoop-env/bin/python3.6)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="Always" serverName="root@192.168.1.99:22 password" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@192.168.1.99:22 password">
<serverdata>
<mappings>
<mapping deploy="/home/battery_health_new" local="$PROJECT_DIR$" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ALWAYS" />
</component>
</project>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Remote Python 3.6.3 (sftp://root@192.168.1.99:22/root/hadoop-env/bin/python3.6)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.6.3 (sftp://root@192.168.1.99:22/root/hadoop-env/bin/python3.6)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/battery_health_new.iml" filepath="$PROJECT_DIR$/.idea/battery_health_new.iml" />
</modules>
</component>
</project>
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import battery_health
import findspark
findspark.init()
print("当前Python路径:", sys.executable)
sys.path.append('/home/hadoop/spark3.2.4/python')
# sys.path.append('/usr/spark/spark-2.4.6-bin-hadoop2.7/python')
# sys.path.append('C:/tools/spark-2.4.8-bin/python')
# sys.path.append('/usr/lib/spark-current/python')
import os
os.environ['JAVA_HOME'] = '/home/soft/jdk1.8.0_271'
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6'
os.environ['PYSPARK_PYTHON'] = '/root/hadoop-env/bin/python'
# Point to the cluster config directories when debugging remotely
os.environ["HADOOP_CONF_DIR"] = "/home/hadoop/hive3.1.3/conf"
os.environ["SPARK_CONF_DIR"] = "/home/hadoop/spark3.2.4/conf"
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark_conf = SparkConf()
spark_conf.setAppName("huafon-battery-health")
spark_conf.setMaster("local[*]")
spark_conf.set("spark.driver.memory", "60g")
spark_conf.set("spark.executor.memory","10g") # 每个executor的内存
#spark_conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
spark_conf.set("spark.sql.crossJoin.enabled", "true")
spark_conf.set("spark.execution.arrow.enabled", "true") # 启用Arrow数据,加速计算
spark_conf.set("spark.shuffle.service.enabled", "true")
spark_conf.set("spark.storage.memoryFraction", "0.6") # 设置储存因子,即储存用的内存占已分配内存的0.6
# Hive dynamic-partition settings
spark_conf.set("hive.exec.dynamic.partition", "true")
spark_conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
spark_conf.set("hive.exec.max.dynamic.partitions", "1000")
#spark_conf.set("spark.driver.maxResultSize", "4g") # 调试用,设置结果大小为2048MB
# spark_conf.setMaster("local[*]")
# spark_conf.set("spark.logConf", "false")
# spark_conf.set("spark.dynamicAllocation.enabled", "true") # 设置动态分配内存开关
# spark_conf.set("spark.sql.broadcastTimeout", "300") # 设置最大广播超时时间
# spark_conf.set("spark.default.parallelism", "500") # 设置最大并行数目
# spark_conf.set("spark.driver.maxResultSize", "4g")
# spark_conf.set("spark.sql.queryExecutionListeners", "")
# spark_conf.set('spark.sql.execution.arrow.enabled', 'true')
# spark_conf.set("spark.scheduler.mode", "FAIR") # 设置公平调度模式
spark = SparkSession.builder.config(conf=spark_conf).enableHiveSupport().getOrCreate()
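# Note (illustrative, not part of the original pipeline logic): downstream modules
# are expected to reuse this session rather than create their own, e.g.
#   from battery_health import spark, configure
#   spark.sql("show databases").show()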
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
本文件用来记录不同电站各个设备数据所对应的pid_name以及dev_id。
需要在输入的时候注意:
details中的参数类型需要统一,例如’vol_lower‘,所有电站都需要是double类型,不能有的是int型有的是double型。
"""
# Stations currently in use
# station_list = ['xmei', 'hnc']
station_list = ['hnc']
#station_list = ['shtb']
# pid names to be written to InfluxDB
pid_name_list = ['RealSOH', 'TheorySOH', 'UsedRecycleTimes', 'RemainingRecycleTimes', 'RemainingRecycleDays',
'ForecastProfitMonth', 'BatteryMileageAmount', 'BatteryEfficiencyDay', 'AvgFullCharge',
'AvgFullDischarge', 'BatteryMileageDay', 'ChargeEnergyRetention', 'DischargeEnergyRetention',
'ChargeEndSOC', 'DischargeEndSOC', 'ChargeEndMaxVoltDiff', 'DischargeEndMaxVoltDiff',
'ChargeEndVoltSTD', 'DischargeEndVoltSTD', 'ChargeEndVoltageDiff', 'DischargeEndVoltageDiff',
'ChargeEndVoltageDeviation', 'DischargeEndVoltageDeviation', 'HealthScore', 'MaxCellTempRange']
# Parameters not yet finalized
consistency_weights = [0.1, 0.3, 0.6]
consistency_score_weights = {'temp_weight': 0.5, 'vol_weight': 0.5}
health_score_weights = {'vol_cons': 0.3, 'temp_cons': 0.15, 'cap_cons': 0.5, 'res_cons': 0.05}
cell_health_range = 3
# cell_health_range = 1
general_map = {'hnc': {'id': {'enu': {'dev_id': ['628234'], 'pid_name': ['SystemLoad', 'ActivePower']},
'batu': {'dev_id': ['628607'], 'pid_name': []},
'batc': {'dev_id': ['628241', '628422', '628568', '628608',
'628656', '629330', '629967', '629968']},
'PCS': {'dev_id': ['628549'], 'pid_name': ['DcActivePower']},
'cell': {}},
'details': {'station_id': '628421',
# 'full_charge_index': 85,
'full_charge_index': 40,
'theory_decay_ration': -25,
'power_freq_change': '2019-01-31 23:59:00', 'sample_freq_former': 15 * 60,
'sample_freq_latter': 1 * 10, 'cell_max_capacity': 1215.6, 'profit_per_kwh': 0.36,
'standard_capacity': 2334, 'motion_time': '2021-07-10', 'total_discharge': 3000000,
'threshold': 5, 'freq': '10s', 'charge_energy_pid': '11152001400284',
'discharge_energy_pid': '11152001400286', 'forecast_time_gap': 180, 'percent': 5,
'cell_total': 1920, 'soc_diff_filter': 60, 'stop_soh': 50,
'ChargeEndSOC': 80, 'DischargeEndSOC': 10,
'vol_lower': 1.0, 'vol_upper': 5.0, 'temp_lower': 10.0, 'temp_upper': 60.0,
'cell_soc_flag': 1, 'theory_info': {'theory_cycles': 4000, 'theory_stop_soh': 80},
'charge_stop_vol': 3.6, 'discharge_stop_vol': 2.8,
'energy_zero_date': '2024-01-01',}
},
'xmei': {'id': {'enu': {'dev_id': ['600906'], 'pid_name': ['SystemLoad', 'ActivePower']},
'batu': {'dev_id': ['601052'], 'pid_name': []},
'batc': {'dev_id': ['601051', '908342']},
'cell': {}},
'details': {'station_id': '580828', 'display_time': '2022-06-17', 'full_charge_index': 70,
'discharge_energy_pid': '1114999998508', 'threshold': 5,
'charge_energy_pid': '1114999998507', 'power_freq_change': '2019-01-31 23:59:00',
'sample_freq_former': 1, 'sample_freq_latter': 1,
'profit_per_kwh': 0.36, 'standard_capacity': 422, 'motion_time': '2022-05-10',
'total_discharge': 3000000, 'init_display': 115559.0,
'forecast_time_gap': 360, 'theory_decay_ration': -25, 'unit_time': '1S',
'percent': 5, 'cell_total': 480, 'cell_max_capacity': 883.2, 'stop_soh': 50,
'vol_diff_threshold': 0, 'soc_diff_filter': 50, 'freq': '10s',
'vol_lower': 1.0, 'vol_upper': 5.0, 'temp_lower': 10.0, 'temp_upper': 60.0,
'ChargeEndSOC': 80, 'DischargeEndSOC': 20,
'cell_soc_flag': 0, 'theory_info': {'theory_cycles': 4000, 'theory_stop_soh': 80},
'charge_stop_vol': 3.6, 'discharge_stop_vol': 2.8}
},
'shtb': {'id': {'enu': {'dev_id': ['21202'], 'pid_name': ['SystemLoad', 'ActivePower']},
'batu': {'dev_id': ['21145', '23180', '23112', '23053'], 'pid_name': []},
'batc': {'dev_id': ['23054', '23026', '23027', '23249',
'23250', '23163', '23251', '23164']},
'cell': {}},
'details': {'station_id': '20259', 'power_freq_change': '2019-01-31 23:59:00', 'threshold': 5,
'sample_freq_former': 15 * 60, 'sample_freq_latter': 1 * 10, 'full_charge_index': 94,
'forecast_time_gap': 360, 'theory_decay_ration': -25, 'unit_time': '10S',
'percent': 5, 'cell_total': 1856, 'stop_period': ['2019-12-04', '2020-09-30'],
'vol_diff_threshold': 0, 'profit_per_kwh': 0.36, 'standard_capacity': 1100,
'motion_time': '2019-01-01', 'total_discharge': 3000000, 'stop_soh': 40,
'cell_max_capacity': 750.0, 'soc_diff_filter': 60, 'freq': '10s',
'discharge_energy_pid': '100010043011128', 'cell_soc_flag': 0,
'theory_info': {'theory_cycles': 2000, 'theory_stop_soh': 60},
'charge_energy_pid': '100010043011126', 'energy_zero_date': '2019-04-02',
'ChargeEndSOC': 90, 'DischargeEndSOC': 10,
'vol_lower': 1.0, 'vol_upper': 5.0, 'temp_lower': 15.0, 'temp_upper': 45.0,
'charge_stop_vol': 3.6, 'discharge_stop_vol': 2.8}
}
}
general_map = {i:general_map[i] for i in station_list}
dev_id_list = []
for i in general_map.keys():
for j in ['enu', 'batu', 'batc']:
extra_ids = general_map[i]['id'][j]['dev_id']
dev_id_list += extra_ids
station_id = [general_map[i]['details']['station_id'] for i in general_map.keys()]
station_id = str(station_id).replace('[', '(').replace(']', ')')
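# Worked example (illustrative): with station_list = ['hnc'], station_id is first the
# string "['628421']" and the bracket replacement turns it into "('628421')",
# ready to be dropped into a SQL "in" clause.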
# Base tables
data_table = 'prod.ods_data_meas_hs'
pid_table = 'prod.dwb_dev_pid_meta_h'
rel_table = 'prod.dwb_dev_relation_h'
# Intermediate tables
state_table = 'prod.dwb_dev_state_dd'
cell_base_table = "prod.dwb_cell_base_data"
cell_capacity_table = "prod.dwb_cell_capacity_dd"
cell_capacity_conf_table = "prod.dwb_cell_capacity_conf_dd"
dev_table = "prod.dwb_dev_data_dd" # intermediate processing table
cell_health_1 = "prod.dm_cell_health_dd"
cell_health_table = "prod.dm_cell_health_second_dd" # cell health score table
batu_health_table = "prod.dm_batu_health_second_dd" # battery-unit health score table
cap_table = "prod.dm_dev_capacity_dd" # device capacity table
cell_lithium_table = "prod.dwb_cell_lithium_dd" # cell lithium-plating table
cell_resistance_table = "prod.dwb_cell_resistance_dd" # cell internal-resistance table
cell_circuit_table = "prod.dwb_cell_micro_circuit_dd" # cell micro-short-circuit table
daily_batu_health_table = "prod.dwb_daily_batu_health_dd" # daily battery-unit health score table
end_record_table = "prod.dwb_dev_endRecord_dd" # device charge/discharge end-of-period record table
# state_table = 'cx.dwb_dev_state_dd'
# cell_base_table = "cx.dwb_cell_base_data"
# cell_capacity_table = "cx.dwb_cell_capacity_10_dd"
# cell_capacity_conf_table = "cx.dwb_cell_capacity_conf_dd"
# dev_table = "cx.dwb_dev_data_dd"
# cell_health_1 = "cx.dm_cell_health_dd"
# cell_health_table = "cx.dm_cell_health_second_dd"
# batu_health_table = "cx.dm_batu_health_second_dd"
# cap_table = "cx.dm_dev_capacity_dd"
# cell_lithium_table = "cx.dwb_cell_lithium_dd"
# cell_resistance_table = "cx.dwb_cell_resistance_dd"
# cell_circuit_table = "cx.dwb_cell_micro_circuit_dd"
# daily_batu_health_table = "cx.dwb_daily_batu_health_dd"
influxdb_table = "prod.app_influxdb_data"
batu_tempCons_table = "prod.dm_batu_tempCons_dd" # battery-unit temperature consistency table
battery_operation_table = "prod.dm_capacity_estimate_dd" # battery O&M capacity table
# statistic_mid_table = "prod.dwb_statistic_mid_dd" # newly enabled intermediate table for the detail page (statistics)
statistic_mid_table = "prod.dwb_statistic_interm_dd"
# Result tables
health_data = "test.health_data"
health_score = "test.health_score"
soh_table = "test.decay_data"
# Tables not yet enabled
cell_ica_table = "prod.dwb_cell_ica_data" # cell ICA data table
voltage_exception_table = "prod.dwb_exception_record_dd" # cell abnormal-voltage record table
discharge_energy_table = "prod.dwb_dev_dce_dd"
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
from dateutil.relativedelta import relativedelta
import pyspark.sql.types as t
import pyspark.sql.functions as F
from battery_health import spark, configure
"""
理想的情况:
一个table对应一个通用的数据获取方法,
目前采用的规范:
对于数据获取方法来说,为了统一性,传入值必须是list格式(就算是一个值,也需要用list格式传入)。
所有的将 ['xxx', 'xxx'] 转为 ('xxx', 'xxx')的工作,需要在方法内部完成,不要在外部完成。
涉及到 spark.sql 语句的地方,尽量把这些语句缩小成最基础的方法来实现。
由此实现,一个表只对应一个spark.sql方法。
"""
# table : prod.ods_data_meas_hs
def get_ods_data(pid, date, targets=None, flag=None, station=None, hour=None):
"""
:param pid: [list of str] or str ,点号
:param date: [list of str] or str ,‘xxxx-xx-xx’格式日期
:param targets: [list of str] or str ,需要的列名
:param flag: [list of int] or int ,判断符
:param station: [list of str] or str ,电站(‘shtb’)
:param hour: [list of str] or str ,小时(0~24)
:return: prod.ods_data_meas_hs[targets]
"""
# 传入数据校验,将传出的格式统一为('xxx','xxx','xxx')格式
stations = general_transform(station,configure.station_list)
dates = general_transform(date)
pids = general_transform(pid)
if targets is None:
targets = '*'
targets = str(targets).replace("'",'').replace('[','').replace(']','')
# Build the SQL statement; the hour partition is rarely used, so it is appended separately.
sql_str = "select {targets} from prod.ods_data_meas_hs where station in {stations} and dt in {dates} and pid in {pid}" \
.format(targets=targets, stations=stations, dates=dates, pid=pids)
print("-------------------hour",hour)
sql_str = sql_str + sql_extra('hour', hour) + sql_extra('flag', flag)
# Fetch the data and convert the timestamp to 'yyyy-MM-dd HH:mm:ss'.
# print("data_load method get_ods_data sql:" + sql_str)
data = spark.sql(sql_str)
data = data.withColumn('times', F.col('times').astype(t.DoubleType())) \
.withColumn('times', F.col('times') / 1000000000) \
.withColumn('times', F.from_unixtime(F.col('times'), 'yyyy-MM-dd HH:mm:ss'))
# data.show()
return data
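# Illustrative usage (pid taken from configure's hnc charge_energy_pid, date assumed):
#   df = get_ods_data(pid=['11152001400284'], date='2024-10-15',
#                     targets=['pid', 'times', 'val'], station='hnc')
#   df.show(5)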
# table : prod.dwb_dev_pid_meta_h
def get_dev_pid(pid_name=None, targets=None, date=None, station_id_list=None, tier=None):
"""
这里传入date的list的意义不大,取的是list中最后一天的可用数据。
:param pid_name: [list of str] or str ,数据类型(必要参数)
:param targets: [list of str] or str ,需要的列名
:param date: [list of date] or date ,时间(所有格式都可以)
:param station_id_list: [list of str] or str ,电站id(设备表和关系表只能用id)
:param tier: [list of str] or str ,层级(默认enu+batu+batc)
:return: prod.dwb_dev_pid_meta_h[targets]
"""
# 传入数据的校验,日期单独处理。
if date is None:
date = '99991230'
elif type(date) == type(''):
date = date.replace('-', '')
elif type(date) == type([]):
date = max(date).replace('-', '')
if targets is None:
targets = '*'
targets = str(targets).replace("'",'').replace('[','').replace(']','')
tiers = general_transform(tier, ['BatteryCluster', 'BatteryUnit', 'Storage'])
default_id_list = [configure.general_map[i]['details']['station_id'] for i in configure.station_list]
station_ids = general_transform(station_id_list, default_id_list)
sql_str = "select {targets} from {table} where station_id in {station_ids} and s_date <= '{date}' " \
"and e_date > '{date}' and dev_type in {tiers}".format(table = configure.pid_table, targets=targets,
date=date, station_ids=station_ids, tiers=tiers)
sql_str = sql_str + sql_extra('pid_name', pid_name)
print("data_load method get_dev_pid sql:" + sql_str)
# Fetch the data
dev_pid = spark.sql(sql_str)
return dev_pid
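# Illustrative usage (station id taken from configure's hnc entry, columns assumed):
#   pid_df = get_dev_pid(pid_name='RealSOH', targets=['pid', 'dev_id', 'pid_name'],
#                        station_id_list='628421', tier='BatteryUnit')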
# table : prod.dwb_dev_relation_h
def get_dev_rel(targets=None, date=None, station_id_list=None, cell_id=None, batc_id=None, batu_id=None, enu_id=None):
"""
这里传入date的list的意义不大,取的是list中最后一天的可用数据。
:param targets: [list of str] or str ,需要的列名
:param date: [list of date] or date ,时间(所有格式都可以)
:param station_id_list: [list of str] or str ,电站id(设备表和关系表只能用id)
:param cell_id:
:param batc_id:
:param batu_id:
:param enu_id:
:return: prod.dwb_dev_relation_h[targets]
"""
# 传入数据的校验,日期单独处理。
if date is None:
date = '99991230'
elif type(date) == type(''):
date = date.replace('-', '')
elif type(date) == type([]):
date = max(date).replace('-', '')
if targets is None:
targets = '*'
targets = str(targets).replace("'", '').replace('[', '').replace(']', '')
default_id_list = [configure.general_map[i]['details']['station_id'] for i in configure.station_list]
station_ids = general_transform(station_id_list, default_id_list)
sql_str = "select {targets} from {table} where station_id in {station_ids} and s_date < '{date}' " \
"and e_date > '{date}'"\
.format(table = configure.rel_table, targets=targets, date=date, station_ids=station_ids)
sql_str = sql_str + sql_extra('cell_id', cell_id) + sql_extra('batc_id', batc_id) + sql_extra('batu_id', batu_id) \
+ sql_extra('enu_id', enu_id)
print("data_load get_dev_rel sql_str:" + sql_str)
# Fetch the data
dev_rel = spark.sql(sql_str)
return dev_rel
# table: prod.dwb_dev_state_dd
def get_state(date, targets=None, dev_id=None, state=None, full_charge=None, station=None):
"""
:param date: [list of 'xxxx-xx-xx'] or 'xxxx-xx-xx' ,日期
:param targets: [list of str] or str ,需要的列名
:param dev_id: [list of str] or str ,设备ID
:param state: 'charge' or 'discharge' ,状态
:param full_charge: 0 or 1 ,是否满充满放
:param station: [list of str] or str ,电站
:return: prod.dwb_dev_state_dd[targets]
"""
stations = general_transform(station,configure.station_list)
dates = general_transform(date)
if targets is None:
targets = '*'
targets = str(targets).replace("'",'').replace('[','').replace(']','')
sql_str = "select {targets} from prod.dwb_dev_state_dd where station in {stations} and dt in {dates}"\
.format(targets=targets, stations=stations, dates=dates)
sql_str = sql_str + sql_extra('full_charge', full_charge) + sql_extra('state', state) + sql_extra('dev_id', dev_id)
print("data_load get_state sql_str:" + sql_str)
data = spark.sql(sql_str)
return data
# table: prod.dwb_cell_base_data
def get_cell_base(date, targets=None, cell_id=None, station=None):
stations = general_transform(station, configure.station_list)
dates = general_transform(date)
if targets is None:
targets = '*'
targets = str(targets).replace("'",'').replace('[','').replace(']','')
sql_str = "select {targets} from {table} where station in {stations} and dt in {dates}"\
.format(table=configure.cell_base_table, targets=targets, stations=stations, dates=dates)
sql_str = sql_str + sql_extra('cell_id', cell_id)
print("data_load get_cell_base sql_str:" + sql_str)
data = spark.sql(sql_str)
return data
# Generic loader: table name + date
def get_data_general(table, date):
dates = general_transform(date)
sql_str = "select * from {table} where dt in {dates}".format(table=table, dates=dates)
print("data_load get_data_general sql:", sql_str)
data = spark.sql(sql_str)
return data
"""小组件"""
# 转换数值为适合sql使用的str格式
def general_transform(data, default=None):
"""
:param data: str 或 list 格式
:param default: 默认值,在传入值为None时,传入值的默认值。
:return: ('xxx') 或 ('xxx','xxx')
这是一个用于数据表方法的传入数据的校验,将传入的数据以合适的格式传出。
不适用于所有的数据格式,例如:日期的格式有 '2022-02-02'和'20220202',这种情况请自己解决。
"""
if data is not None:
if type(data) == type([]):
data = str(data).replace('[', '(').replace(']', ')')
elif type(data) == type(''):
data = str([data]).replace('[', '(').replace(']', ')')
else:
data = general_transform(default)
return data
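# Worked examples (illustrative):
#   general_transform(['a', 'b'])   -> "('a', 'b')"
#   general_transform('a')          -> "('a')"
#   general_transform(None, ['x'])  -> "('x')"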
# The "and xxx in (...)" fragment of a SQL statement
def sql_extra(name=None, extra=None):
"""
:param name: 字段类型
:param extra: 额外内容
:return: 处理后语句
"""
if extra is not None:
extra = general_transform(extra,'')
extra_str = ' and {name} in '.format(name = name) + extra
else:
extra_str = ''
return extra_str
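# Worked examples (illustrative):
#   sql_extra('flag', [0, 1])     -> " and flag in (0, 1)"
#   sql_extra('state', 'charge')  -> " and state in ('charge')"
#   sql_extra('state', None)      -> ""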
# Fetch per-station indices from configure: frequency, full charge/discharge criteria, thresholds.
def get_index(index):
"""
:param index: 数据类型
:return: ['station', 指定数据]
"""
type_dict = {type('str'):t.StringType(),type(1):t.IntegerType(), type(1.0):t.DoubleType()}
index_type = type_dict[type(configure.general_map[configure.station_list[0]]['details'][index])]
_schema1 = t.StructType([
t.StructField('station', t.StringType(), True),
t.StructField(index, index_type, True)
])
ind_list = []
for i in configure.general_map.keys():
ind_list.append([
i,
configure.general_map[i]['details'][index]
])
index_df = spark.createDataFrame(ind_list, schema=_schema1)
return index_df
# Get the device ids of the given tier
def get_id_by_tier(tier):
dev_id = []
for i in configure.station_list:
dev_id += configure.general_map[i]['id'][tier]['dev_id']
return dev_id
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
import pyspark.sql.functions as F
import pyspark
import time
import pyspark.sql.types as t
import pandas as pd
from dateutil.relativedelta import relativedelta
from pyspark.sql import Window
from battery_health.functions.data_verification import data_check
import time
import requests
from battery_health.functions.statistic_data import statistic_data_prep, statistic_calculation
from battery_health.functions.health_score import health_score_data_prep
from battery_health import spark
def sava_to_hive(date):
# create_risk_data(date=date)
# save_data = create_capacity_soh(date=date)
health_data = data_load.get_data_general('test.health_data_tmp', date=date)
pid_data = data_load.get_dev_pid(targets=['pid', 'dev_id', 'pid_name'],
tier=['BatteryCluster', 'BatteryUnit', 'Storage', 'Cell'])
hive_data = health_data.join(pid_data, on=['dev_id', 'pid_name'], how='left')\
.filter(F.col('station').isin(configure.station_list))
hive_data = hive_data.withColumn('dt', F.lit(date)).select(['pid', 'times', 'val', 'dt', 'station']).dropna()
# The 00:00 data of the target pids for that day is cleared separately
return hive_data
def empty_influxdb(date, pid_name_list, station=None):
struct_time = time.strptime(date, '%Y-%m-%d')
date_s = str(int(time.mktime(struct_time))*1000000000)
dev_pid = data_load.get_dev_pid(pid_name=pid_name_list, station_id_list='580828',
tier=['BatteryCluster', 'BatteryUnit', 'Storage', 'Cell'])
dp = dev_pid.toPandas()
for i in range(10):
print('round:', i)
dp_ = dp[i::10]
dp_list = dp_.pid.to_list()
#dp_list = ['11159999938769','11159999938775','11159999938774','11159999952235']
s = str(dp_list).replace('[', '%28pid%3D%27').replace(',', '%27+or+pid%3D%27').replace("'", '')\
.replace(' ', '').replace(']', '%27%29+and+time%3D')+date_s
headers = {
'Accept': 'application/csv',
}
station_numb_dict = {'xmei': '1114', 'hnc': '1115'}
if station is None:
response = requests.get(
# 'https://ts-bp1ue44w0814t8cx9.influxdata.tsdb.aliyuncs.com:8086/query?db=sgool&u=huafeng&p=Huafeng@2022&precision=ms&q=delete+from+data_meas_1114_1d+where+{}'.format(s),
'http://192.168.1.99:48088/query?db=sgool&u=yunhe&p=yunhe2022&precision=ms&q=delete+from+data_meas_1114_1d+where+{}'.format(s),
headers=headers,
)
print('round:', i, 'xmei delete finished')
response = requests.get(
# 'https://ts-bp1ue44w0814t8cx9.influxdata.tsdb.aliyuncs.com:8086/query?db=sgool&u=huafeng&p=Huafeng@2022&precision=ms&q=delete+from+data_meas_1115_1d+where+{}'.format(s),
'http://192.168.1.99:48088/query?db=sgool&u=yunhe&p=yunhe2022&precision=ms&q=delete+from+data_meas_1115_1d+where+{}'.format(s),
headers=headers,
)
print('round:', i, 'hnc delete finished')
else:
response = requests.get(
# 'https://ts-bp1ue44w0814t8cx9.influxdata.tsdb.aliyuncs.com:8086/query?db=sgool&u=huafeng&p=Huafeng@2022&precision=ms&q=delete+from+data_meas_{station_numb}_1d+where+{s}'
# .format( s=s, station_numb=station_nmub_dict[station]),
'http://192.168.1.99:48088/query?db=sgool&u=yunhe&p=yunhe2022&precision=ms&q=delete+from+data_meas_{station_numb}_1d+where+{s}'.format(s=s, station_numb=station_numb_dict[station]),
headers=headers,
)
print('round:', i, station, 'delete finished')
print('x')
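# Worked example (illustrative pid values): for dp_list = ['111', '222'] the chained
# replaces above yield
#   s = '%28pid%3D%27111%27+or+pid%3D%27222%27%29+and+time%3D' + date_s
# i.e. the URL-encoded predicate (pid='111' or pid='222') and time=<ns timestamp>.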
def empty_table(table, station, date):
try:
spark.sql("alter table {table} drop partition(station='{station}', dt='{date}')"
.format(table=table, station = station, date = date))
except pyspark.sql.utils.AnalysisException:
print('partition(station={station}, dt={date}) does not exist'.format(station = station, date = date))
def statistic_repair(date, station, empty=True):
# Recompute the detail-page intermediate table
empty_table(table=configure.statistic_mid_table, station=station, date=date)
mid_data = statistic_data_prep.data_prep_station(date).filter(F.col('station')==station)
mid_data.write.format('hive').insertInto(configure.statistic_mid_table, overwrite=True)
# Recompute the detail-page data
empty_table(table='test.health_data_tmp', station=station, date=date)
data = statistic_calculation.get_statistic(date=date, cycles=1).filter(F.col('station')==station)
data.write.format('hive').insertInto('test.health_data_tmp', overwrite=True)
# Aggregate the data; results are stored in the cx database's app_influxdb_data table
empty_table(table='cx.app_influxdb_data', station=station, date=date)
hive_data = sava_to_hive(date).filter(F.col('station') == station)
hive_data.write.format('hive').insertInto('cx.app_influxdb_data', overwrite=True)
# Clear the corresponding data in InfluxDB
if empty:
pid_name_list = ['ChargeEndSOC', 'DischargeEndSOC', 'ChargeEndMaxVoltDiff', 'DischargeEndMaxVoltDiff',
'ChargeEndVoltSTD', 'DischargeEndVoltSTD', 'ChargeEndVoltageDiff', 'DischargeEndVoltageDiff',
'ChargeEndVoltageDeviation', 'DischargeEndVoltageDeviation']
empty_influxdb(date, pid_name_list, station)
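# Illustrative usage (date assumed): recompute one station's detail-page data for a day
# and clear the matching InfluxDB points:
#   statistic_repair(date='2024-10-15', station='hnc', empty=True)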
if __name__ == '__main__':
print('x')
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import main_process
def t():
main_process.test()
#!/usr/bin/env python
# encoding: utf-8
from battery_health import configure, data_load
import pyspark.sql.functions as F
class DataVolumeError(Exception):
def __init__(self,merge_data):
self.merge_data = merge_data
pass
def check_dev_data(data):
# Check data volume, using the soc column as the key
data = data.select(['soc', 'station']).drop_duplicates()
data = data.toPandas()
for i in data.station.unique():
if len(data[data['station']==i])<len(configure.dev_id_list):
print('station:', i, 'Insufficient Data Volume!')
# raise DataVolumeError('Insufficient Data Volume!')
def check_state_data(state):
# Check the row count of the state table, using the state column as the key
state = state.select(['state']).drop_duplicates()
st = state.toPandas()
if len(st)<1:
return 1
else: return 0
def check_cell_number(data):
# Check the number of cells with data
data = data.toPandas()
for i in data.station.unique():
d = data[data['station']==i]
actual = len(d.cell_id.unique())
installed = configure.general_map[i]['details']['cell_total']
if actual != installed:
print('station:',i)
print('An abnormal number of cells was detected:')
print(installed,'cells installed, but',actual,'cells have data!')
def check_cell_data(data, col_name):
# Voltage / temperature range validation
lower_range = data_load.get_index('{}_lower'.format(col_name))
upper_range = data_load.get_index('{}_upper'.format(col_name))
data_range = lower_range.join(F.broadcast(upper_range), on=['station'], how='left')
data = data.join(F.broadcast(data_range), on=['station'], how='left').filter(
(F.col(col_name) > F.col('{}_lower'.format(col_name))) & (F.col(col_name) < F.col('{}_upper'.format(col_name))))\
.drop('{}_lower'.format(col_name)).drop('{}_upper'.format(col_name))
return data
def check_soc_data(data, col_name):
# End-of-charge / end-of-discharge SOC validation
charge_range = data_load.get_index('ChargeEndSOC')
discharge_range = data_load.get_index('DischargeEndSOC')
data_range = charge_range.join(F.broadcast(discharge_range), on=['station'], how='left')
data = data.join(F.broadcast(data_range), on=['station'], how='left')
charge_data = data.filter((F.col('state')=='charge') & (F.col(col_name)>F.col('ChargeEndSOC')))
discharge_data = data.filter((F.col('state')=='discharge') & (F.col(col_name)<F.col('DischargeEndSOC')))
merge_data = charge_data.unionByName(discharge_data).drop('ChargeEndSOC').drop('DischargeEndSOC')
return merge_data
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Compute SoH-related indices, use linear regression for forecasting, and derive additional
station-level indices.
Targets: [actual SoH, theoretical SoH, profit, daily unit mileage, total unit mileage,
battery efficiency, charge retention, discharge retention, remaining life, remaining cycle
count, used cycle count, average full-charge energy, average full-discharge energy.]
'''
#!/usr/bin/python
# -*- coding: utf-8 -*-
import time
import pandas as pd
from battery_health.functions.derive import derive
from battery_health import spark
def calculate_derive_index(date):
# Combine per-device energy and state data across charge/discharge periods
cap_data = derive.cache_capacity_soh(date=date)
# Filter the discharge data according to each station's operating conditions
capacity_soh = derive.filter_soc_by_station(cap_data)
capacity_soh_pdf = capacity_soh.toPandas()
# Capacity and SoH are calculated on a discharge basis
dis_capacity_soh = capacity_soh_pdf[capacity_soh_pdf.state == 'discharge']
ch_capacity_soh = capacity_soh_pdf[capacity_soh_pdf.state == 'charge']
dev_charge_cap = derive.calculate_capacity(capacity_soh_pdf=ch_capacity_soh, switch=True)
dev_discharge_cap = derive.calculate_capacity(capacity_soh_pdf=dis_capacity_soh, switch=True)
dev_soh_ = derive.calculate_soh(dev_capacity=dev_discharge_cap, date=date) # 计算设备的SoH
print("----------------dev_soh_ show:",dev_soh_)
ch_soh = derive.calculate_soh(dev_capacity=dev_charge_cap, date=date)
print("----------------ch_soh show:",ch_soh)
# Energy-unit charge/discharge retention
enu_ch_ren = derive.calculate_retention(data=ch_capacity_soh)
enu_dis_ren = derive.calculate_retention(data=dis_capacity_soh)
# Battery-unit mileage
batu_mil = derive.calculate_batu_mileage(dev_discharge_data=dev_discharge_cap, date=date)
batu_mil['pid_name'] = 'BatteryMileageAmount'
daily_mil = derive.calculate_daily_mileage(dev_discharge_data=dev_discharge_cap, date=date)
daily_mil['pid_name'] = 'BatteryMileageDay'
daily_mil = daily_mil[daily_mil['dt'] == date]
# Used cycles, remaining cycles and theoretical SoH
charge_energy, discharge_energy = derive.get_energy_data()
used_cycles, remainder_cycles, theory_soh = derive.calculate_remainder_cycles(discharge_energy, soh_data=dev_soh_,
date=date)
# Energy-unit efficiency
enu_efficiency = derive.calculate_efficiency(charge_soh=ch_soh, discharge_soh=dev_soh_)
# Full-charge / full-discharge energy
enu_full_charge = derive.get_full_energy(data=enu_efficiency, state='charge')
enu_full_discharge = derive.get_full_energy(data=enu_efficiency, state='discharge')
enu_full_charge = enu_full_charge[['dev_id', 'charge_full', 'dt', 'station']]
enu_full_discharge = enu_full_discharge[['dev_id', 'discharge_full', 'dt', 'station']]
# Forecast SoH
forecast_soh, remainder_days = derive.calculate_forecast_soh(data=dev_soh_, col='discharge_soh', date=date)
# Forecast profit
forecast_profit = derive.calculate_forecast_profit(charge_energy_df=charge_energy,
discharge_energy_df=discharge_energy,
forecast_soh_df=forecast_soh)
# Rename the value columns and attach a pid_name for each index
cols = ['dev_id', 'pid_name', 'dt', 'val', 'station']
dev_soh_.rename(columns={'discharge_soh': 'val'}, inplace=True) # actual SoH history on the dashboard
forecast_soh.rename(columns={'forecast_soh': 'val'}, inplace=True) # forecast actual SoH on the dashboard
real_soh = pd.concat([dev_soh_, forecast_soh], axis=0)
real_soh['pid_name'] = 'RealSOH'
theory_soh.rename(columns={'theory_soh': 'val'}, inplace=True) # theoretical SoH on the dashboard (history and forecast)
theory_soh['pid_name'] = 'TheorySOH'
used_cycles.rename(columns={'used_cycles': 'val'}, inplace=True) # used cycle count on the dashboard
used_cycles['pid_name'] = 'UsedRecycleTimes'
remainder_cycles.rename(columns={'remainder_cycles': 'val'}, inplace=True) # remaining cycle count on the dashboard
remainder_cycles['pid_name'] = 'RemainingRecycleTimes'
remainder_days.rename(columns={'remainder_days': 'val'}, inplace=True) # remaining usable days on the dashboard
remainder_days['pid_name'] = 'RemainingRecycleDays'
forecast_profit.rename(columns={'profit': 'val'}, inplace=True) # forecast monthly profit on the dashboard
forecast_profit['pid_name'] = 'ForecastProfitMonth'
enu_efficiency.rename(columns={'efficiency': 'val'}, inplace=True) # energy-unit efficiency on the dashboard
enu_efficiency['pid_name'] = 'BatteryEfficiencyDay'
enu_full_charge.rename(columns={'charge_full': 'val'}, inplace=True) # full-charge energy curve in battery O&M
enu_full_charge['pid_name'] = 'AvgFullCharge'
enu_full_discharge.rename(columns={'discharge_full': 'val'}, inplace=True) # full-discharge energy curve in battery O&M
enu_full_discharge['pid_name'] = 'AvgFullDischarge'
enu_ch_ren.rename(columns={'retention': 'val'}, inplace=True) # charge retention in efficiency analysis
enu_ch_ren['pid_name'] = 'ChargeEnergyRetention'
enu_dis_ren.rename(columns={'retention': 'val'}, inplace=True) # discharge retention in efficiency analysis
enu_dis_ren['pid_name'] = 'DischargeEnergyRetention'
# Aggregate all indices
derive_index = pd.concat([real_soh[cols], theory_soh[cols], used_cycles[cols], remainder_cycles[cols],
remainder_days[cols], forecast_profit[cols], batu_mil[cols], daily_mil[cols],
enu_efficiency[cols], enu_full_charge[cols], enu_full_discharge[cols],
enu_ch_ren[cols], enu_dis_ren[cols]
], axis=0)
derive_index.rename(columns={'dt': 'times'}, inplace=True)
derive_index['times'] = derive_index.times.apply(
lambda x: int(time.mktime(time.strptime(str(x)[:10], "%Y-%m-%d"))) * 1000)
derive_index['val'] = derive_index.val.astype('float')
derive_index['val'] = round(derive_index['val'], 3)
derive_index['dt'] = date
derive_index_spark = spark.createDataFrame(derive_index)
derive_index_spark = derive_index_spark.select(['dev_id', 'pid_name', 'times', 'val', 'dt', 'station'])
return derive_index_spark
if __name__ == '__main__':
dt = '2024-10-15'
data = calculate_derive_index(date=dt)
data.head()
#!/usr/bin/python
# -*- coding: utf-8 -*-
from battery_health import spark, configure
import pyspark.sql.functions as F
from pyspark.sql import Window
import math
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn import linear_model
STATION_LIST = list(configure.general_map.keys())
# enu_id_list = [configure.general_map[s]['id']['enu']['dev_id'][0] for s in STATION_LIST]
enu_id_list = list()
for s in STATION_LIST:
enu_id_list += configure.general_map[s]['id']['enu']['dev_id']
def cache_capacity_soh(date):
"""
抽取电站充放电状态以及电量数据并进行合并,将合并后的数据进行缓存、
Args:
date: string
Returns:
capacity_soh: Spark DataFrame []
"""
state_df = spark.sql("select * from {table_name} ".format(table_name=configure.state_table))
capacity_df = spark.sql("select * from {table_name} ".format(table_name=configure.cap_table))
state_df = state_df.filter(F.col('dt') <= date)
capacity_df = capacity_df.filter(F.col('dt') <= date)
# For the Shanghai data center: replace dev_id to work around the pid binding issue between the gateway switch and #1 storage unit
# state_df = state_df.replace(['21241'], ['21202'], 'dev_id')
# capacity_df = capacity_df.replace(['21241'], ['21202'], 'dev_id')
capacity_soh = capacity_df.join(state_df, on=['dev_id', 'state', 'start_time', 'dt', 'station'], how='left')
capacity_soh = capacity_soh.cache()
capacity_soh.count()
# Keep only the energy-unit and battery-unit data
dev_id_list = []
for station in STATION_LIST:
# first add the energy-unit dev_ids
dev_id_list = dev_id_list + configure.general_map[station]['id']['enu']['dev_id']
# then add the battery-unit dev_ids
dev_id_list = dev_id_list + configure.general_map[station]['id']['batu']['dev_id']
capacity_soh = capacity_soh.filter(F.col('dev_id').isin(dev_id_list))
return capacity_soh
def filter_soc_by_station(capacity_soh):
"""
根据不同电站的实际工况,对电站的运行数据进行过滤,以便后续计算容量、SoH
Args:
capacity_soh: Spark DataFrame
Returns:
init_data: Spark DataFrame
"""
# 对于不同的电站,获取相应的过滤标准并完成数据过滤
station_data_dict = dict()
for station in STATION_LIST:
filter_val = configure.general_map[station]['details']['soc_diff_filter']
condition = (F.col('soc_diff') >= filter_val) | (F.col('soc_diff') <= -filter_val)
station_data_dict[station] = capacity_soh.filter(F.col('station') == station).filter(condition)
# Concatenate the filtered per-station data
init_data = station_data_dict[STATION_LIST[0]]
for station in STATION_LIST[1:]:
station_data = station_data_dict[station]
init_data = init_data.unionByName(station_data)
return init_data
def process_capacity_outlier(dev_pdf, switch=False):
dev_pdf.sort_values('dt', inplace=True)
# Cap energy values that exceed the nominal capacity
station = dev_pdf.station.iloc[0]
std_cap = configure.general_map[station]['details']['standard_capacity']
if switch:
dev_pdf['basic_capacity'] = dev_pdf.basic_capacity.apply(lambda x: x if x <= std_cap else std_cap)
# Drop data from the station's trial-operation period
motion_time = configure.general_map[station]['details']['motion_time']
dev_pdf = dev_pdf[dev_pdf.dt >= motion_time]
dev_pdf.index = list(range(len(dev_pdf)))
start_stop_list = recognize_off_stream(data=dev_pdf)
fix_df = pd.DataFrame(data=None)
for i in list(range(len(start_stop_list)))[::2]:
start = start_stop_list[i]
end = start_stop_list[i + 1]
period_df = dev_pdf[(dev_pdf.dt >= start) & (dev_pdf.dt <= end)]
period_df = process_accident(data=period_df) # handle abrupt jumps
period_df = process_outlier(data=period_df) # handle outliers
period_df = smooth_capacity_curve(data=period_df) # smooth the curve
fix_df = pd.concat([fix_df, period_df], axis=0)
fix_df.index = list(range(len(fix_df)))
return fix_df
def recognize_off_stream(data):
# Identify station shutdown periods
dt_list = data.dt.to_list()
dt_list = [pd.to_datetime(x) for x in dt_list]
dt_diff_list = [(j - i).days for i, j in zip(dt_list[:-1], dt_list[1:])]
period_list = list()
period_list.append(0)
for i in list(range(len(dt_diff_list))):
if dt_diff_list[i] >= 50: # a gap of 50 days or more between dates is treated as a shutdown
period_list.append(i)
period_list.append(i + 1)
period_list.append(len(dt_list) - 1)
start_stop_list = [data.dt.iloc[x] for x in period_list]
return start_stop_list
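# Worked example (illustrative dates): for dt values
#   ['2022-01-01', '2022-01-02', '2022-04-01']
# the 89-day gap exceeds the 50-day threshold, so start_stop_list becomes
#   ['2022-01-01', '2022-01-02', '2022-04-01', '2022-04-01'],
# i.e. one running period 2022-01-01..2022-01-02 and a second one covering only 2022-04-01.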
def smooth_capacity_curve(data):
if len(data) >= 10:
need_smooth_data = data.basic_capacity.to_list()
kernel = [0.2] * 5
smoothed_data = list(np.convolve(need_smooth_data, kernel, mode='valid'))
smoothed_data = need_smooth_data[:2] + smoothed_data + need_smooth_data[-2:]
data['basic_capacity'] = smoothed_data
else:
data = data
return data
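# Note (illustrative): [0.2] * 5 is a 5-point moving average; the 'valid' convolution
# shortens the series by 4 points, so the first and last two raw values are re-attached
# to keep the smoothed series the same length as the input.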
def process_outlier(data):
"""
基于
:param data:
:return:
"""
for i in range(len(data)):
pre_cap = data.basic_capacity.iloc[max(0, i - 10):(i - 1)].mean()
lst_cap = data.basic_capacity.iloc[(i + 1):min(len(data) - 1, i + 10)].mean()
cur_cap = data.basic_capacity.iloc[i]
if (abs(cur_cap - pre_cap) / pre_cap > 0.05) | (abs(cur_cap - lst_cap) / lst_cap > 0.05):
index = list(range(max(0, i - 10), min(len(data) - 1, i + 10)))
data.loc[i, 'basic_capacity'] = data.basic_capacity.iloc[index].median()
return data
def process_outlier_by_3_sigma(data):
std = np.std(data)
avg = np.mean(data)
upper_val = avg + 3 * std
lower_val = avg - 3 * std
for i in list(range(len(data))):
if (data[i] >= upper_val) | (data[i] <= lower_val):
data[i] = np.median(data[max(0, i - 10):min(i + 10, len(data) - 1)])
return data
def process_accident(data):
"""
处理容量曲线中突上突下的情况
"""
data.index = list(range(len(data)))
cap_list = data.basic_capacity.to_list()
cap_diff = [j - i for i, j in zip(cap_list[:-1], cap_list[1:])]
outlier_index = list()
for i in list(range(len(cap_diff) - 1)):
cur_diff = cap_diff[i]
nxt_diff = cap_diff[i + 1]
std = np.std(cap_list[max(0, (i - 3)):(i + 4)])
avg = np.mean(cap_list[max(0, (i - 3)):(i + 4)])
diff_sum = abs(cur_diff + nxt_diff)
diff_prd = cur_diff * nxt_diff
if (diff_sum < 2 * std) & (abs(diff_prd) > 2 * avg) & (diff_prd < 0):
outlier_index.append(i)
analysis_index = [i+1-3, i+1-2, i+1-1, i+1+1, i+1+2, i+1+3]
analysis_index = [x for x in analysis_index if x > 0]
analysis_index = [x for x in analysis_index if x <= (len(data) - 1)]
data.loc[i+1, 'basic_capacity'] = data.basic_capacity.iloc[analysis_index].mean()
return data
def get_daily_max_cap(data):
max_basic_capacity = data.basic_capacity.max()
max_cap_data = data[data.basic_capacity == max_basic_capacity]
return max_cap_data
def get_soh(data):
std_cap = data.basic_capacity.max()
data['soh'] = data.basic_capacity / std_cap * 100
return data
def calculate_capacity(capacity_soh_pdf, switch):
"""
计算各个电站中各个设备(主要是电池单元以及能量单元)的容量以及SoH。
Args:
capacity_soh_pdf: Spark DataFrame []
switch: boolean
Returns:
dev_capacity_soh: dataframe
'dev_id': string,
'state': string,
'start_time': string,
'end_time': string,
'dt': string,
'station': string,
'capacity': double,
'soc_diff': double,
'full_charge': int,
'basic_capacity': double
"""
# capacity_soh_pdf = pd.read_pickle('/home/emr-user/zhangwj/data/capacity_soh_pdf.pkl')
# Scale the daily discharge energy by soc_diff to get a preliminary capacity
capacity_soh_pdf['basic_capacity'] = abs(capacity_soh_pdf['capacity'] / capacity_soh_pdf['soc_diff'] * 98)
# Take the daily maximum discharge energy as the day's base capacity
dev_capacity = capacity_soh_pdf.groupby(['dev_id', 'dt', 'station']).apply(lambda x: get_daily_max_cap(x))
dev_capacity.index = list(range(len(dev_capacity)))
# Clean outliers in the preliminary capacity data
# cols = ['dev_id']
# dev_capacity['basic_capacity'] = dev_capacity.groupby(cols).apply(process_capacity_outlier)
new_dev_capacity = pd.DataFrame(data=None)
for dev_id in dev_capacity.dev_id.unique():
input_data = dev_capacity[dev_capacity.dev_id == dev_id]
output_data = process_capacity_outlier(dev_pdf=input_data, switch=switch)
new_dev_capacity = pd.concat([new_dev_capacity, output_data])
new_dev_capacity.index = list(range(len(new_dev_capacity)))
return new_dev_capacity
def calculate_soh(dev_capacity, date):
# Compute per-device SoH
dev_capacity = dev_capacity.groupby('dev_id').apply(get_soh)
cols = ['dev_id', 'soh', 'dt', 'station']
dev_soh = pd.DataFrame(data=None)
for dev_id in dev_capacity.dev_id.unique():
dev_data = dev_capacity[dev_capacity.dev_id == dev_id]
dev_data = dev_data[cols]
dev_data = generate_off_stream_data(dev_data=dev_data, col_name='soh') # generate SoH data for shutdown periods
if dev_id in enu_id_list:
recent_soh = generate_recent_soh_data(dev_data=dev_data, cur_date=date)
else:
recent_soh = pd.DataFrame(data=None)
tmp_df = pd.concat([dev_data, recent_soh], axis=0)
tmp_df = fill_dt(data=tmp_df, col='soh')
dev_soh = pd.concat([dev_soh, tmp_df], axis=0)
return dev_soh
def generate_off_stream_data(dev_data, col_name):
# Correct the data after a restart and simulate data for the shutdown period
start_stop_list = recognize_off_stream(data=dev_data)
for i in range(len(start_stop_list))[1::2]:
start = start_stop_list[i]
if start != start_stop_list[-1]:
end = start_stop_list[i + 1]
start_pre = str((pd.to_datetime(start) - relativedelta(days=30)).date())
end_lst = str((pd.to_datetime(end) + relativedelta(days=30)).date())
start_col = dev_data[(dev_data.dt >= start_pre) & (dev_data.dt <= start)][col_name].mean()
end_col = dev_data[(dev_data.dt >= end) & (dev_data.dt <= end_lst)][col_name].mean()
date_list = pd.date_range(start=start, end=end, freq='D')
date_list = [str(x)[:10] for x in date_list][1:-1]
lgt = len(date_list)
step = (-1) * (start_col - end_col) / lgt
if step >= 0:
soh_list = [end_col] * lgt
else:
soh_list = np.arange(start_col, end_col, step)
soh_list = soh_list[:lgt]
station = dev_data.station.iloc[0]
dev_id = dev_data.dev_id.iloc[0]
period_df = pd.DataFrame(data={'dev_id': [dev_id] * lgt, col_name: soh_list,
'dt': date_list, 'station': [station] * lgt})
dev_data = pd.concat([dev_data, period_df], axis=0)
dev_data.sort_values('dt', inplace=True)
dev_data.index = list(range(len(dev_data)))
return dev_data
def generate_recent_soh_data(dev_data, cur_date):
lst_dt = dev_data.dt.max()
gap_days = (pd.to_datetime(cur_date) - pd.to_datetime(lst_dt)).days
if gap_days > 0: # part of the recent data needs to be simulated
station = dev_data.station.iloc[0]
date_list = pd.date_range(lst_dt, cur_date, freq='D')
date_list = [str(x)[:10] for x in date_list][1:]
lgt = len(date_list)
lst_soh = dev_data.soh.iloc[-1]
rate = configure.general_map[station]['details']['theory_decay_ration'] / 10000
end_soh = lst_soh + rate * gap_days
soh_list = np.arange(lst_soh, end_soh, rate)
soh_list = soh_list[:lgt]
dev_id = dev_data.dev_id.iloc[0]
recent_soh_df = pd.DataFrame(data={'dev_id': [dev_id] * lgt, 'soh': soh_list,
'dt': date_list, 'station': [station] * lgt})
else:
recent_soh_df = pd.DataFrame(data=None)
return recent_soh_df
def calculate_batu_mileage(dev_discharge_data, date):
batu_id = list()
for station in STATION_LIST:
batu_id += configure.general_map[station]['id']['batu']['dev_id']
batu_discharge_data = dev_discharge_data[(dev_discharge_data.dev_id.isin(batu_id)) &
(dev_discharge_data.state == 'discharge')]
cols = ['dev_id', 'station']
batu_mileage = batu_discharge_data.groupby(cols).apply(lambda x: x.basic_capacity.sum()).reset_index()
batu_mileage.columns = cols + ['val']
batu_mileage['dt'] = date
return batu_mileage
def calculate_daily_mileage(dev_discharge_data, date):
batu_id = list()
for station in STATION_LIST:
batu_id += configure.general_map[station]['id']['batu']['dev_id']
batu_discharge_data = dev_discharge_data[(dev_discharge_data.dev_id.isin(batu_id)) &
(dev_discharge_data.state == 'discharge')]
cols = ['dev_id', 'station', 'dt']
batu_mileage = batu_discharge_data.groupby(cols).apply(lambda x: x.basic_capacity.sum()).reset_index()
batu_mileage.columns = cols + ['val']
batu_mileage = batu_mileage[batu_mileage['dt'] == date]
return batu_mileage
def calculate_efficiency(charge_soh, discharge_soh):
"""
传入能量单元的充电/放电,并对其进行数据处理
Args:
charge_soh:
discharge_soh:
Returns:
"""
charge_soh.rename(columns={'soh': 'charge_soh'}, inplace=True)
discharge_soh.rename(columns={'soh': 'discharge_soh'}, inplace=True)
soh_df = pd.merge(charge_soh, discharge_soh, on=['dev_id', 'dt', 'station'], how='left')
efficiency_df = pd.DataFrame(data=None)
for dev_id in enu_id_list:
enu_soh_df = soh_df[soh_df.dev_id == dev_id]
enu_soh_df.index = list(range(len(enu_soh_df)))
for i in range(len(enu_soh_df)):
if math.isnan(enu_soh_df.discharge_soh.iloc[i]):
enu_soh_df.loc[i, 'discharge_soh'] = np.nanmean(enu_soh_df.discharge_soh.iloc[(max(0, i - 5)):(i + 5)])
efficiency_df = pd.concat([efficiency_df, enu_soh_df])
efficiency_df['efficiency'] = efficiency_df.discharge_soh / efficiency_df.charge_soh
efficiency_df['efficiency'] = efficiency_df.efficiency.apply(lambda x: x if x >= 0.92 else 0.92)
efficiency_df['efficiency'] = efficiency_df.efficiency.apply(lambda x: x if x <= 0.9999 else 0.9999)
efficiency_df['efficiency'] = efficiency_df.efficiency * 100
return efficiency_df
def get_full_energy(data, state):
col_name = state + '_soh'
new_col = state + '_full'
station_list = list()
capacity_list = list()
for station in STATION_LIST:
station_list.append(station)
capacity_list.append(configure.general_map[station]['details']['standard_capacity'])
cap_df = pd.DataFrame(data={'station': station_list, 'capacity': capacity_list})
data = pd.merge(data, cap_df, on='station', how='left')
data[new_col] = data[col_name] * data.capacity / 100
data.drop(['capacity'], axis=1, inplace=True)
return data
def calculate_theory_soh(used_cycles_df, rate):
"""
理论SoH的计算,根据技术规格书给出的参数以及当下已循环次数估算理论SoH,并通过对循环次数的预测,进而对理论SoH进行预测。
Args:
used_cycles_df: dataframe ['dev_id', 'used_cycles', 'dt', 'station']
rate: float
Returns:
"""
station = used_cycles_df.station.iloc[0]
theory_info = configure.general_map[station]['details']['theory_info']
theory_cycles = theory_info['theory_cycles']
theory_stop_soh = theory_info['theory_stop_soh']
used_cycles_df['theory_soh'] = used_cycles_df.used_cycles.apply(lambda x: 100 - x / theory_cycles * theory_stop_soh)
# Forecast the future from the slope and the given parameters
cur_cycles = used_cycles_df.used_cycles.iloc[-1]
forecast_cycles = np.arange(cur_cycles, theory_cycles, max(1, abs(rate))) # step the forecast by max(1, |rate|) cycles per day
# cycles_daily = cur_cycles / (pd.to_datetime(used_cycles_df.dt.max()) - pd.to_datetime(used_cycles_df.dt.min())).days
# forecast_cycles = np.arange(cur_cycles, theory_cycles, cycles_daily)
start_date = pd.to_datetime(used_cycles_df.dt.max()) + relativedelta(days=1)
end_date = start_date + relativedelta(days=len(forecast_cycles))
date_list = pd.date_range(start=start_date, end=end_date, freq='D')
date_list = [str(x) for x in date_list]
dev_id = used_cycles_df.dev_id.iloc[0]
date_list = date_list[:len(forecast_cycles)]
lgt = len(date_list)
forecast_theory_soh = pd.DataFrame(data={'dev_id': [dev_id] * lgt, 'forecast_cycles': forecast_cycles,
'dt': date_list, 'station': [station] * lgt})
forecast_theory_soh['theory_soh'] = forecast_theory_soh.forecast_cycles.apply(
lambda x: 100 - x / theory_cycles * theory_stop_soh)
forecast_theory_soh['dt'] = forecast_theory_soh.dt.astype('str')
forecast_theory_soh['dt'] = forecast_theory_soh.dt.apply(lambda x: x[:10])
cols = ['dev_id', 'theory_soh', 'dt', 'station']
theory_soh = pd.concat([used_cycles_df[cols], forecast_theory_soh[cols]], axis=0)
theory_soh.index = list(range(len(theory_soh)))
stop_soh = configure.general_map[station]['details']['stop_soh']
theory_soh = theory_soh[theory_soh.theory_soh >= stop_soh]
all_date_list = pd.date_range(theory_soh.dt.min(), theory_soh.dt.max(), freq='D')
date_df = pd.DataFrame(data={'dt': all_date_list})
date_df['dt'] = date_df.dt.astype('str')
theory_soh = pd.merge(date_df, theory_soh, on='dt', how='left')
theory_soh.ffill(inplace=True)
return theory_soh
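# Worked example (using hnc's theory_info from configure: theory_cycles=4000,
# theory_stop_soh=80): after 1000 used cycles the formula above gives
#   theory_soh = 100 - 1000 / 4000 * 80 = 80.0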
def build_linear_model(train_input, train_output):
model = linear_model.LinearRegression()
model.fit(train_input, train_output)
return model
def calculate_forecast_soh(data, col, date):
forecast_soh = pd.DataFrame(data=None)
remainder_days = pd.DataFrame(data=None)
for enu_id in enu_id_list:
enu_data = data[data.dev_id == enu_id]
enu_data.sort_values('dt', inplace=True)
enu_data.index = list(range(len(enu_data)))
station = enu_data.station.iloc[0]
stop_soh = configure.general_map[station]['details']['stop_soh']
time_gap = configure.general_map[station]['details']['forecast_time_gap']
base_dt = enu_data.dt.iloc[0]
enu_data['x_index'] = enu_data.dt.apply(lambda x: (pd.to_datetime(x) - pd.to_datetime(base_dt)).days + 1)
train_data = np.array(enu_data.x_index.to_list()).reshape([-1, 1])[-time_gap:]
train_label = np.array(enu_data[col].to_list()[-time_gap:])
model = build_linear_model(train_input=train_data, train_output=train_label)
rate = -0.015 if model.coef_ < -0.025 else model.coef_
rate = -0.01 if rate >= -0.01 else rate
lst_val = enu_data[col].iloc[-1]
forecast_val = np.arange(lst_val, stop_soh, rate)
# Build the matching date sequence
dev_id = enu_data.dev_id.iloc[0]
last_day = str((pd.to_datetime(enu_data.dt.max()) + relativedelta(days=1)).date())
end_day = str((pd.to_datetime(last_day) + relativedelta(days=len(forecast_val) - 1)).date())
day_list = list(pd.date_range(start=last_day, end=end_day))
forecast_soh_df = pd.DataFrame(data={'dev_id': [dev_id] * len(day_list), 'dt': day_list,
'forecast_soh': list(forecast_val), 'station': [station] * len(day_list)})
forecast_soh_df['dt'] = forecast_soh_df.dt.astype('str')
forecast_soh = pd.concat([forecast_soh, forecast_soh_df], axis=0)
enu_remainder_days = pd.DataFrame(data={'dev_id': [dev_id], 'remainder_days': [len(day_list)],
'dt': [date], 'station': [station]})
remainder_days = pd.concat([remainder_days, enu_remainder_days], axis=0)
return forecast_soh, remainder_days
def calculate_used_cycles(date, soh_data):
"""
基于能量单元的反向功能示数计算已用循环次数
"""
discharge_pid = [configure.general_map[s]['details']['discharge_energy_pid'] for s in STATION_LIST]
discharge_pid = str(discharge_pid).replace('[', '(').replace(']', ')')
# This query may not return the reverse-energy reading for every energy unit
discharge_sp = spark.sql("select pid, max(val) cost, dt, station from {table} where dt = '{date}' and pid "
"in {pid_list} group by pid, dt, station".format(table=configure.data_table,
date=date, pid_list=discharge_pid))
discharge_df = discharge_sp.toPandas()
discharge_df['std_cap'] = discharge_df.station.apply(lambda s:
configure.general_map[s]['details']['standard_capacity'])
discharge_df = pd.merge(discharge_df, soh_data, on=['dt', 'station'], how='left')
discharge_df['cycles'] = discharge_df.cost / (discharge_df.std_cap * (1 + discharge_df.soh / 100) / 2)
used_cycles = discharge_df[['dev_id', 'cycles', 'dt', 'station']]
return used_cycles
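# Worked example (illustrative numbers, units as stored in the source table): with a
# cumulative reverse-energy reading cost = 1_000_000, standard_capacity = 2334 (hnc)
# and soh = 90, one equivalent cycle is 2334 * (1 + 90 / 100) / 2 = 2217.3, so
#   cycles = 1_000_000 / 2217.3 ≈ 451.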
def get_energy_data():
discharge_pid_list = [configure.general_map[s]['details']['discharge_energy_pid'] for s in STATION_LIST]
charge_pid_list = [configure.general_map[s]['details']['charge_energy_pid'] for s in STATION_LIST]
station_list = str(STATION_LIST).replace('[', '(').replace(']', ')')
pid_list = charge_pid_list + discharge_pid_list
pid_list = str(pid_list).replace('[', '(').replace(']', ')')
energy_data = spark.sql("select pid, times, val, dt, station from {table} where station in {station_list} and"
" pid in {pid_list}".format(table=configure.data_table, station_list=station_list,
pid_list=pid_list))
window = Window.partitionBy(['station', 'dt', 'pid'])
energy_data = energy_data.withColumn('max_time', F.max('times').over(window))\
.filter(F.col('times') == F.col('max_time'))
energy_data_df = energy_data.toPandas()
charge_energy = energy_data_df[energy_data_df.pid.isin(charge_pid_list)]
discharge_energy = energy_data_df[energy_data_df.pid.isin(discharge_pid_list)]
return charge_energy, discharge_energy
def get_dev_discharge(enu_discharge_df, soh_data):
dev_id_list = [configure.general_map[s]['id']['enu']['dev_id'][0] for s in STATION_LIST]
discharge_pid_list = [configure.general_map[s]['details']['discharge_energy_pid'] for s in STATION_LIST]
capacity_list = [configure.general_map[s]['details']['standard_capacity'] for s in STATION_LIST]
stop_soh = [configure.general_map[s]['details']['stop_soh'] for s in STATION_LIST]
dev_id_pid = pd.DataFrame(data={'dev_id': dev_id_list, 'pid': discharge_pid_list,
'cap': capacity_list, 'stop_soh': stop_soh})
enu_discharge_df = pd.merge(enu_discharge_df, dev_id_pid, on='pid', how='left')
soh_data = soh_data[soh_data.dev_id.isin(enu_id_list)]
discharge_energy_soh = pd.merge(enu_discharge_df, soh_data, on=['dev_id', 'dt', 'station'], how='left')
cols = ['dev_id', 'val', 'soh', 'cap', 'stop_soh', 'dt', 'station']
discharge_energy_soh = discharge_energy_soh[cols]
return discharge_energy_soh, dev_id_pid
def calculate_remainder_cycles(enu_discharge_df, soh_data, date):
"""
构建循环次数与SoH的数学模型,输入截止SoH获取最大循环次数,该值减去当前已循环次数便是剩余循环次数.
Args:
enu_discharge_df
soh_data
date
"""
discharge_energy_soh, dev_id_pid = get_dev_discharge(enu_discharge_df=enu_discharge_df, soh_data=soh_data)
used_cycle_df = pd.DataFrame(data=None)
remainder_cycle_df = pd.DataFrame(data=None)
theory_soh_df = pd.DataFrame(data=None)
for dev_id in dev_id_pid.dev_id.unique():
enu_df = discharge_energy_soh[discharge_energy_soh.dev_id == dev_id]
print("----------------enu_df show:",enu_df)
station = enu_df.station.iloc[0]
enu_df.sort_values('dt', inplace=True)
enu_df.dropna(inplace=True)
try:
zero_dt = configure.general_map[station]['details']['energy_zero_date']
enu_df = enu_df[enu_df.dt >= zero_dt]
except KeyError:
print('There is no energy_zero_date in configure.py.')
enu_df.index = list(range(len(enu_df)))
init_val = enu_df.val.min()
enu_df['total_discharge'] = enu_df['val'] - init_val
enu_df['unit_cost'] = enu_df.apply(lambda x: x['cap'] * (100 + x['soh']) / 100 / 2, axis=1)
enu_df['used_cycles'] = enu_df['total_discharge'] / enu_df['unit_cost']
train_data = np.array(enu_df.used_cycles.to_list()).reshape([-1, 1])[-60:]
train_label = np.array(enu_df.soh.to_list())[-60:]
model = build_linear_model(train_input=train_data, train_output=train_label)
rate = model.coef_[0]
rate = -0.0125 if rate + 0.0125 > 0 else rate
lgt = len(np.arange(enu_df.soh.iloc[-1], enu_df.stop_soh.iloc[-1], rate))
tmp_remain_df = pd.DataFrame(data={'dev_id': [dev_id], 'remainder_cycles': [lgt],
'dt': [date], 'station': [station]})
remainder_cycle_df = pd.concat([remainder_cycle_df, tmp_remain_df], axis=0)
# tmp_used_df = pd.DataFrame(data={'dev_id': [dev_id], 'used_cycles': [enu_df.used_cycles.iloc[-1]],
# 'dt': [date], 'station': [station]})
tmp_used_df = enu_df[['dev_id', 'used_cycles', 'dt', 'station']]
used_cycle_df = pd.concat([used_cycle_df, tmp_used_df], axis=0)
# Compute and forecast the theoretical SoH
theory_soh = calculate_theory_soh(used_cycles_df=tmp_used_df, rate=rate)
theory_soh_df = pd.concat([theory_soh_df, theory_soh], axis=0)
return used_cycle_df, remainder_cycle_df, theory_soh_df
def calculate_forecast_profit(charge_energy_df, discharge_energy_df, forecast_soh_df):
dev_id_list = [configure.general_map[s]['id']['enu']['dev_id'][0] for s in STATION_LIST]
charge_pid_list = [configure.general_map[s]['details']['charge_energy_pid'] for s in STATION_LIST]
discharge_pid_list = [configure.general_map[s]['details']['discharge_energy_pid'] for s in STATION_LIST]
dev_pid = pd.DataFrame(data={'dev_id': dev_id_list, 'c_pid': charge_pid_list, 'd_pid': discharge_pid_list})
charge_energy_df = pd.merge(charge_energy_df, dev_pid[['dev_id', 'c_pid']], left_on='pid',
right_on='c_pid', how='left')
discharge_energy_df = pd.merge(discharge_energy_df, dev_pid[['dev_id', 'd_pid']], left_on='pid',
right_on='d_pid', how='left')
forecast_profit = pd.DataFrame(data=None)
for dev_id in dev_id_list:
enu_forecast_soh = forecast_soh_df[forecast_soh_df.dev_id == dev_id]
enu_charge = charge_energy_df[charge_energy_df.dev_id == dev_id]
enu_discharge = discharge_energy_df[discharge_energy_df.dev_id == dev_id]
station = enu_charge.station.iloc[0]
try:
zero_dt = configure.general_map[station]['details']['energy_zero_date']
enu_charge = enu_charge[enu_charge.dt >= zero_dt]
enu_discharge = enu_discharge[enu_discharge.dt >= zero_dt]
except KeyError:
print('There is no energy_zero_date in configure.py.')
enu_charge.sort_values('dt', inplace=True)
enu_charge.index = list(range(len(enu_charge)))
enu_discharge.sort_values('dt', inplace=True)
enu_discharge.index = list(range(len(enu_discharge)))
cols = ['dev_id', 'val', 'dt', 'station']
enu_charge = enu_charge[cols]
enu_discharge = enu_discharge[cols]
enu_charge.rename(columns={'val': 'c_val'}, inplace=True)
enu_discharge.rename(columns={'val': 'd_val'}, inplace=True)
energy_df = pd.merge(enu_charge, enu_discharge, on=['dev_id', 'dt', 'station'], how='inner')
c_val_list = energy_df.c_val.to_list()
d_val_list = energy_df.d_val.to_list()
day_charge = [c_val_list[0]] + [y - x for x, y in zip(c_val_list[:-1], c_val_list[1:])]
day_charge = process_outlier_by_3_sigma(data=day_charge)
energy_df['day_charge'] = day_charge
day_discharge = [d_val_list[0]] + [y - x for x, y in zip(d_val_list[:-1], d_val_list[1:])]
day_discharge = process_outlier_by_3_sigma(data=day_discharge)
energy_df['day_discharge'] = day_discharge
start_dt = energy_df.dt.min()
energy_df['dt_gap'] = energy_df.dt.apply(lambda x: (pd.to_datetime(x) - pd.to_datetime(start_dt)).days)
energy_df = energy_df[energy_df.day_discharge > 20] # TODO: threshold to be confirmed
time_gap = configure.general_map[station]['details']['forecast_time_gap']
train_data = np.array(energy_df.dt_gap.to_list()).reshape([-1, 1])[-time_gap:]
train_label = np.array(energy_df.day_charge.to_list())[-time_gap:]
model = build_linear_model(train_input=train_data, train_output=train_label)
rate = model.coef_[0]
# First compute and forecast the daily charge energy. If the forecast trend is increasing,
# adjust it: compute the per-cycle energy at shutdown and use it as the basis for the daily
# charge energy. If the trend is decreasing, follow that trend down until the daily charge
# energy approaches the per-cycle charge energy at shutdown.
start_val = np.mean(train_label[-int(time_gap / 6):])
min_val = configure.general_map[station]['details']['standard_capacity'] * configure.general_map[station]['details']['stop_soh'] / 100
lgt = len(enu_forecast_soh)
rate = rate if rate < 0 else -(start_val - min_val) / lgt
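# Illustrative example (hypothetical numbers): with start_val = 180 kWh, min_val = 150 kWh and
# lgt = 300 forecast days, a fitted non-negative slope is replaced by -(180 - 150) / 300 = -0.1 kWh/day,
# so the generated series decays from start_val toward min_val over the forecast horizon.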
val_list = np.arange(start_val, min_val, rate)
if len(val_list) >= lgt:
val_list = val_list[:lgt]
else:
val_list = list(val_list) + [min_val] * (lgt - len(val_list))
enu_forecast_soh['forecast_val'] = val_list
enu_forecast_val = enu_forecast_soh[['dev_id', 'forecast_val', 'dt']].copy()
enu_forecast_val['station'] = station
profit_per_kwh = configure.general_map[station]['details']['profit_per_kwh']
enu_forecast_val['profit'] = enu_forecast_val.forecast_val * profit_per_kwh
enu_forecast_val['year_month'] = enu_forecast_val.dt.apply(lambda x: x[:7])
enu_forecast_val = enu_forecast_val.groupby(['dev_id', 'station', 'year_month']).apply(lambda x: x.profit.sum()).reset_index()
enu_forecast_val.columns = ['dev_id', 'station', 'year_month', 'profit']
cur_month = enu_forecast_soh.dt.min()[:7]
actual_charge = energy_df[energy_df.dt >= cur_month + '-01']['day_charge'].sum()
actual_profit = actual_charge * profit_per_kwh
condition = enu_forecast_val.year_month == cur_month
enu_forecast_val.loc[condition, 'profit'] = actual_profit + enu_forecast_val.loc[condition, 'profit'].iloc[0]
enu_forecast_val.rename(columns={'year_month': 'dt'}, inplace=True)
enu_forecast_val['dt'] = enu_forecast_val.dt.apply(lambda x: x + '-01')
forecast_profit = pd.concat([forecast_profit, enu_forecast_val], axis=0)
return forecast_profit
def calculate_retention(data):
"""
Compute the retention rate of the energy unit during charge and discharge.
Args:
data: dataframe
Returns:
retention_df: dataframe
"""
station_list = data.station.unique()
retention_df = pd.DataFrame(data=None)
for st in station_list:
dev_id = configure.general_map[st]['id']['enu']['dev_id']
st_data = data[data.dev_id.isin(dev_id)]
std_cap = configure.general_map[st]['details']['standard_capacity']
dev_charge_ren = calculate_capacity(capacity_soh_pdf=st_data, switch=False)
dev_charge_ren.loc[dev_charge_ren.basic_capacity >= std_cap * 1.2, 'basic_capacity'] = std_cap * 1.2
# dev_charge_ren = dev_charge_ren[dev_charge_ren.basic_capacity <= std_cap * 1.2]
max_value = dev_charge_ren.basic_capacity.max()
dev_charge_ren['retention'] = dev_charge_ren.basic_capacity.apply(lambda x: x / max_value)
tmp_df = dev_charge_ren[['dev_id', 'state', 'basic_capacity', 'retention', 'dt', 'station']]
# Fill in missing dates
full_dt = fill_dt(data=tmp_df, col='retention')
full_dt['retention'] = full_dt['retention'].apply(lambda x: x * 100)
retention_df = pd.concat([retention_df, full_dt], axis=0)
return retention_df
def fill_dt(data, col):
"""
Fill missing values over a full daily date range.
Args:
data: dataframe
col: string
Returns:
full_dt: dataframe
"""
start = data.dt.min()
end = data.dt.max()
date_list = pd.date_range(start=start, end=end, freq='D')
date_df = pd.DataFrame(data={'dt': date_list})
date_df['dt'] = date_df.dt.astype('str')
full_dt = pd.merge(date_df, data, on=['dt'], how='left')
# full_dt.ffill(inplace=True)
# Fill the missing dates using interpolation
min_dt = data.dt.min()
data['gap'] = data.dt.apply(lambda x: (pd.to_datetime(x) - pd.to_datetime(min_dt)).days)
full_dt['gap'] = full_dt.dt.apply(lambda x: (pd.to_datetime(x) - pd.to_datetime(min_dt)).days)
xp = data.gap.to_list()
yp = data[col].to_list()
x = full_dt[full_dt.dev_id.isnull()]['gap'].to_list()
y = np.interp(x, xp, yp)
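# Illustrative example (hypothetical numbers): with observed gaps xp = [0, 3] and values yp = [100, 94],
# a missing day at gap = 1 is linearly interpolated by np.interp to 98.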
full_dt.loc[full_dt.dev_id.isnull(), col] = y
full_dt.ffill(inplace=True)
full_dt = full_dt[['dev_id', col, 'dt', 'station']]
return full_dt
if __name__ == '__main__':
dt = '2023-02-04'
# cap_data = cache_capacity_soh(date=dt)
# cap_data = None
# Filter the discharge data according to each station's operating conditions
# capacity_soh = filter_soc_by_station(cap_data)
# capacity_soh_pdf = capacity_soh.toPandas()
capacity_soh_pdf = pd.read_pickle('/home/emr-user/zhangwj/data/capacity_soh_pdf.pkl')
capacity_soh_pdf = capacity_soh_pdf[capacity_soh_pdf['station'].isin(configure.station_list)]
# Capacity and SoH are calculated with the discharge process as the reference
dis_capacity_soh = capacity_soh_pdf[capacity_soh_pdf.state == 'discharge']
ch_capacity_soh = capacity_soh_pdf[capacity_soh_pdf.state == 'charge']
dev_charge_cap = calculate_capacity(capacity_soh_pdf=ch_capacity_soh, switch=True)
dev_discharge_cap = calculate_capacity(capacity_soh_pdf=dis_capacity_soh, switch=True)
dev_soh_ = calculate_soh(dev_capacity=dev_discharge_cap, date=dt)  # calculate device SoH
ch_soh = calculate_soh(dev_capacity=dev_charge_cap, date=dt)
batu_mil = calculate_batu_mileage(dev_discharge_data=dev_discharge_cap, date=dt)
daily_mil = calculate_daily_mileage(dev_discharge_data=dev_discharge_cap, date=dt)
# Calculate the remaining cycle count
# charge_energy, discharge_energy = get_energy_data()
charge_energy = pd.read_pickle('/home/emr-user/zhangwj/data/charge_energy.pkl')
discharge_energy = pd.read_pickle('/home/emr-user/zhangwj/data/discharge_energy.pkl')
used_cycles, remainder_cycles, theory_soh = calculate_remainder_cycles(discharge_energy, soh_data=dev_soh_, date=dt)
# Energy unit efficiency calculation
enu_efficiency = calculate_efficiency(charge_soh=ch_soh, discharge_soh=dev_soh_)
# Aggregate full-charge / full-discharge energy
enu_full_charge = get_full_energy(data=enu_efficiency, state='charge')
enu_full_discharge = get_full_energy(data=enu_efficiency, state='discharge')
# Theoretical SoH calculation
# Calculate energy retention
# enu_ch_retention = calculate_retention(data=ch_capacity_soh)
# Forecast SoH
forecast_soh, remainder_days = calculate_forecast_soh(data=dev_soh_, col='discharge_soh', date=dt)
# Calculate forecast profit
forecast_profit = calculate_forecast_profit(charge_energy_df=charge_energy, discharge_energy_df=discharge_energy,
forecast_soh_df=forecast_soh)
enu_full_charge.head()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
The final health score combines four components: cell temperature and voltage consistency, cell resistance,
the base health score, and lithium-plating detection.
Output: [health score]
'''
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
import pyspark.sql.functions as F
import pyspark.sql.types as t
import pandas as pd
from dateutil.relativedelta import relativedelta
from pyspark.sql import Window
from battery_health.functions.data_verification import data_check
# Health score summary
def score_summary(date):
cell_health_score = data_load.get_data_general(table=configure.cell_health_table, date=date).select(
['cell_id', 'health_score', 'dt', 'station'])
batu_health_score = data_load.get_data_general(table=configure.batu_health_table, date=date).select(
['batu_id', 'health_score', 'dt', 'station'])
cell_health_score = cell_health_score.withColumnRenamed('cell_id', 'dev_id').withColumnRenamed('health_score', 'val')
batu_health_score = batu_health_score.withColumnRenamed('batu_id', 'dev_id').withColumnRenamed('health_score', 'val')
health_data = cell_health_score.unionByName(batu_health_score)
health_data = health_data.withColumn('pid_name', F.lit('HealthScore')).withColumnRenamed('dt', 'times')
health_data = health_data.withColumn('times', F.unix_timestamp('times', 'yyyy-MM-dd'))\
.withColumn('times', F.col('times') * 1000)\
.withColumn('dt', F.lit(date))
health_data = health_data.select(['dev_id', 'pid_name', 'times', 'val', 'dt', 'station'])
return health_data
# Battery unit daily health score
def calculate_batu_daily_health_score(date):
"""
(1) Using the vol and temp data in cell_base_data,
compute each battery unit's temperature range, voltage range and voltage standard deviation, and score them.
(2) The derived data (derive) must be computed beforehand:
fetch the decay_data table and compute the 7-day average RealSOH.
(3) Combine the four metrics into a weighted score.
"""
# Data loading: take the end-of-pass records from the pass with the largest SOC span
data = data_load.get_cell_base(targets=['batu_id', 'cell_id', 'vol', 'temp', 'times', 'soc', 'state', 'dt',
'station'], date=date)
state_window = Window.partitionBy(['batu_id', 'state', 'dt', 'station'])
batu_window = Window.partitionBy(['batu_id', 'dt', 'station'])
# data = data.withColumn('soc_diff', F.max('soc').over(state_window) - F.min('soc').over(state_window))\
# .withColumn('soc_max', F.max('soc_diff').over(batu_window)).filter(F.col('soc_max')==F.col('soc_diff'))
# data = data.withColumn('max_times', F.max('times').over(batu_window)).filter(F.col('max_times')==F.col('times'))
# Extract the station cut-off voltage information
charge_vol_df = data_load.get_index('charge_stop_vol')
charge_vol_df = charge_vol_df.withColumn('state', F.lit('charge')).withColumnRenamed('charge_stop_vol', 'stop_vol')
discharge_vol_df = data_load.get_index('discharge_stop_vol')
discharge_vol_df = discharge_vol_df.withColumn('state', F.lit('discharge'))\
.withColumnRenamed('discharge_stop_vol', 'stop_vol')
stop_vol_df = charge_vol_df.unionByName(discharge_vol_df)
# Data check: filter out abnormal data
data = data_check.check_cell_data(data=data, col_name='vol')
data = data_check.check_cell_data(data=data, col_name='temp')
condition_df = data.withColumn('end_vol', F.when(F.col('state') == 'charge', F.max('vol').over(state_window))
.otherwise(F.min('vol').over(state_window)))\
.withColumn('end_soc', F.when(F.col('state') == 'charge', F.max('soc').over(state_window))
.otherwise(F.min('soc').over(state_window)))\
.withColumn('end_dod', F.max('soc').over(state_window) - F.min('soc').over(state_window))\
.select(['batu_id', 'state', 'end_vol', 'end_soc', 'end_dod', 'dt', 'station']).drop_duplicates()\
.filter(F.col('end_dod') >= 50)
condition_df = condition_df.join(stop_vol_df, on=['station', 'state'], how='left')\
.withColumn('cond_1', F.when(F.abs(F.col('stop_vol') - F.col('end_vol')) <= 0.1, 1).otherwise(0))\
.withColumn('cond_2', F.when((F.col('end_soc') >= 95) & (F.col('state') == 'charge'), 1)
.otherwise(F.when((F.col('end_soc') <= 5) & (F.col('state') == 'discharge'), 1).otherwise(0)))
batu_id = data_load.get_id_by_tier(tier='batu')
state_df = data_load.get_state(date=date, targets=['dev_id', 'state', 'end_time', 'soc_diff', 'dt', 'station'],
dev_id=batu_id)
state_df = state_df.withColumnRenamed('dev_id', 'batu_id')
data = data.join(state_df, on=['batu_id', 'state', 'dt', 'station'], how='left')\
.filter(F.col('times') == F.col('end_time'))
# Filter according to the configured threshold
soc_threshold = data_load.get_index('soc_diff_filter')
data = data.join(soc_threshold, on='station', how='left')\
.filter(F.abs(F.col('soc_diff')) >= F.col('soc_diff_filter'))
# Compute the voltage range, temperature range and voltage standard deviation
cal_window = Window.partitionBy(['batu_id', 'state', 'end_time', 'dt', 'station'])  # end_time distinguishes different charge/discharge passes
data = data.withColumn('vol_range', F.max('vol').over(cal_window) - F.min('vol').over(cal_window))\
.withColumn('temp_range', F.max('temp').over(cal_window) - F.min('temp').over(cal_window)) \
.withColumn('vol_std', F.stddev('vol').over(cal_window)) \
.select(['batu_id', 'state', 'vol_range', 'temp_range', 'vol_std', 'dt', 'station']).drop_duplicates()
# Add the voltage-range score
batu_health = data \
.withColumn('vol_score', F.when(F.col('vol_range') < 0.1, 95 + 50 * (0.1 - F.col('vol_range'))).otherwise(
F.when(F.col('vol_range') <= 0.2, 95 - 100 * (F.col('vol_range') - 0.1)).otherwise(
F.when(F.col('vol_range') <= 0.4, 65 + 100 * (0.4 - F.col('vol_range'))).otherwise(
65 - 100 * (F.col('vol_range') - 0.4)))))
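# Illustrative example: vol_range = 0.15 V falls in the second band, giving 95 - 100 * (0.15 - 0.1) = 90 points.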
# Add the temperature-range score
batu_health = batu_health \
.withColumn('temp_score', F.when(F.col('temp_range') < 6, 95 + 5 / 6 * (6 - F.col('temp_range'))).otherwise(
F.when(F.col('temp_range') <= 8, 95 - 5 * (F.col('temp_range') - 6)).otherwise(
F.when(F.col('temp_range') <= 10, 65 + 10 * (10 - F.col('temp_range'))).otherwise(
65 - 10 * (F.col('temp_range') - 10)))))\
.withColumn('temp_score', F.when(F.col('temp_score') >= 0, F.col('temp_score')).otherwise(0))
# Add the voltage-standard-deviation score
batu_health = batu_health \
.withColumn('std_score', F.when(F.col('vol_std') < 0.02, 95 + 50 * (0.1 - F.col('vol_std'))).otherwise(
F.when(F.col('vol_std') <= 0.04, 95 - 500 * (F.col('vol_std') - 0.02)).otherwise(
F.when(F.col('vol_std') <= 0.06, 85 - 1000 * (F.col('vol_std') - 0.04)).otherwise(
65 - 1000 * (F.col('vol_std') - 0.06)))))
# Decide whether to update the end-of-pass consistency record
record_data = data_load.get_data_general(table=configure.end_record_table, date='3000-01-01')
record_data = record_data.withColumnRenamed('dev_id', 'batu_id')
init_flag = record_data.rdd.isEmpty()
# Initialise the end-consistency record, or update it when the trigger conditions are met
if init_flag:
mix_data = condition_df.join(
batu_health.select(['batu_id', 'state', 'vol_score', 'temp_score', 'std_score', 'station', 'dt']),
on=['batu_id', 'state', 'dt', 'station'], how='left')\
.withColumn('record_dt', F.col('dt'))\
.withColumnRenamed('end_dod', 'end_dod_record')\
.withColumnRenamed('end_soc', 'end_soc_record')\
.withColumnRenamed('end_vol', 'end_vol_record')\
.withColumnRenamed('vol_score', 'vol_record')\
.withColumnRenamed('temp_score', 'temp_record')\
.withColumnRenamed('std_score', 'std_record')\
.withColumn('dt', F.lit('3000-01-01'))
else:
change_flag = F.col('cond_1') + F.col('cond_2') + F.col('cond_3') >= 1
# update_regular = (0.1 < F.abs(F.col('end_vol') - F.col('end_vol_record')) < 0.18) & () & ()
mix_data = condition_df.drop('dt').join(record_data, on=['batu_id', 'state', 'station'], how='left')\
.join(batu_health.select(['batu_id', 'state', 'station', 'vol_score', 'temp_score', 'std_score']),
on=['batu_id', 'state', 'station'], how='left')\
.withColumn('cond_3', F.when(F.col('end_vol') > F.col('end_vol_record'), 1).otherwise(0))\
.withColumn('end_dod_record', F.when(change_flag, F.col('end_dod')).otherwise(F.col('end_dod_record')))\
.withColumn('end_soc_record', F.when(change_flag, F.col('end_soc')).otherwise(F.col('end_soc_record')))\
.withColumn('end_vol_record', F.when(change_flag, F.col('end_vol')).otherwise(F.col('end_vol_record')))\
.withColumn('vol_record', F.when(change_flag, F.col('vol_score')).otherwise(F.col('vol_record')))\
.withColumn('temp_record', F.when(change_flag, F.col('temp_score')).otherwise(F.col('temp_record')))\
.withColumn('std_record', F.when(change_flag, F.col('std_score')).otherwise(F.col('std_record')))\
.withColumn('record_dt', F.when(change_flag, F.lit(date)).otherwise(F.col('record_dt')))
mix_data = mix_data.select(['batu_id', 'end_dod_record', 'end_soc_record', 'end_vol_record', 'vol_record',
'temp_record', 'std_record', 'state', 'record_dt', 'dt', 'station'])\
.withColumnRenamed('batu_id', 'dev_id')
batu_health = mix_data.select(['dev_id', 'state', 'vol_record', 'temp_record', 'std_record', 'station'])\
.withColumnRenamed('dev_id', 'batu_id')\
.withColumnRenamed('vol_record', 'vol_score')\
.withColumnRenamed('temp_record', 'temp_score')\
.withColumnRenamed('std_record', 'std_score')\
.withColumn('dt', F.lit(date))
# Combine the unit's consistency scores from the charge and discharge passes
batu_health = batu_health.groupby(['batu_id']).apply(udf_collections.calculate_batu_end_cons)
# Write the end-consistency record to the table
mix_data.write.format('hive').insertInto(configure.end_record_table, overwrite=True)
# Get the battery unit SoH for the past 7 days
soh_window = Window.partitionBy(['batu_id'])
days = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(7)][1:]
soh_data = data_load.get_data_general(table=configure.soh_table, date=days).filter(F.col('pid_name') == 'RealSOH')
soh_data = soh_data.select(['dev_id', 'val']).filter(F.col('dev_id').isin(data_load.get_id_by_tier('batu')))\
.withColumnRenamed('dev_id', 'batu_id').withColumn('soh', F.mean('val').over(soh_window))\
.drop('val').dropDuplicates()  # average SoH over the past 7 days
# Weighted scoring
batu_health = batu_health.join(soh_data, on='batu_id', how='left') \
.withColumn('health_score', F.col('soh') * 0.35 + F.col('vol_score') * 0.3 + F.col('temp_score') * 0.2 +
F.col('std_score') * 0.15) \
.select(['batu_id', 'temp_score', 'vol_score', 'std_score', 'soh', 'health_score', 'dt', 'station']).drop_duplicates()
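# Illustrative example (hypothetical values): soh = 90, vol_score = 85, temp_score = 95, std_score = 80
# gives a health_score of 90*0.35 + 85*0.3 + 95*0.2 + 80*0.15 = 88.0.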
return batu_health
# Battery unit health score
def calculate_batu_health_score(date):
date_list = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(7)]
his_batu_health = data_load.get_data_general(configure.daily_batu_health_table, date = date_list)
his_batu_health = his_batu_health.filter((F.col('temp_score')<=100)&(F.col('temp_score')>=0))\
.filter((F.col('vol_score')<=100)&(F.col('vol_score')>=0))\
.filter((F.col('std_score')<=100)&(F.col('std_score')>=0))\
.filter((F.col('soh')<=100)&(F.col('soh')>=0))\
.filter((F.col('health_score')<=100)&(F.col('health_score')>=0))
batu_window = Window.partitionBy(['batu_id'])
batu_health = his_batu_health.withColumn('health_score', F.mean('health_score').over(batu_window)) \
.withColumn('temp_score', F.mean('temp_score').over(batu_window)) \
.withColumn('vol_score', F.mean('vol_score').over(batu_window)) \
.withColumn('std_score', F.mean('std_score').over(batu_window)) \
.withColumn('soh', F.mean('soh').over(batu_window)) \
.select(['batu_id', 'temp_score', 'vol_score', 'std_score', 'soh', 'health_score', 'station']) \
.drop_duplicates().withColumn('dt', F.lit(date)) \
.select(['batu_id', 'temp_score', 'vol_score', 'std_score', 'soh', 'health_score', 'dt', 'station'])
return batu_health
# Cell health score
def calculate_cell_health_score(date, cycles=None):
"""
Compute the cell health score as a weighted combination of voltage, temperature, SoH and internal-resistance consistency.
"""
print("cycles:", cycles)
cell_health = data_load.get_data_general(configure.cell_health_1, date).withColumnRenamed('dev_id', 'cell_id')\
.select(['cell_id', 'cell_title', 'vol_cons', 'temp_cons', 'dt', 'station'])  # cell voltage / temperature consistency
cell_resistance = data_load.get_data_general(configure.cell_resistance_table, date)  # cell internal resistance
cell_capacity = data_load.get_data_general(configure.cell_capacity_table, date)  # cell capacity
# If a cell capacity exceeds the installed capacity, cap it at the installed capacity
cell_max_cap = data_load.get_index('cell_max_capacity')
cell_capacity = cell_capacity.join(cell_max_cap, on=['station'], how='left')\
.withColumn('cap_cons', F.col('capacity') / F.col('cell_max_capacity') * 100)\
.withColumn('cap_cons', F.when(F.col('cap_cons') > 100, 100).otherwise(F.col('cap_cons')))
# Merge data
on_col = ['dt', 'station', 'cell_id']
cell_feature = cell_health.join(cell_capacity, on=on_col, how='left').join(cell_resistance, on=on_col, how='left')
# Compute resistance consistency (min-max scaled into the 60-100 range)
window = Window.partitionBy(['dt', 'station'])
cell_feature = cell_feature.withColumn('max_res', F.max('resistance').over(window)) \
.withColumn('min_res', F.min('resistance').over(window))\
.withColumn('res_cons', (F.col('resistance') - F.col('min_res')) /
(F.col('max_res') - F.col('min_res')) * 40 + 60)\
.select(['cell_id', 'cell_title', 'vol_cons', 'temp_cons', 'cap_cons', 'res_cons', 'dt', 'station'])
# First round of filling: fill missing temp data from neighbouring cells
cell_feature = cell_feature.withColumn('numb', F.split(F.col('cell_title'), '-')) \
.withColumn('v0', F.col('numb')[0]).withColumn('v1', F.col('numb')[1].astype(t.IntegerType()))
cell_feature = cell_feature.groupby(['v0', 'station', 'dt']).apply(udf_collections.fill_health_temp)
# Compute health_score
cell_health = cell_feature.\
withColumn('health_score', F.col('vol_cons') * configure.health_score_weights['vol_cons'] +
F.col('temp_cons') * configure.health_score_weights['temp_cons']+
F.col('cap_cons') * configure.health_score_weights['cap_cons'] +
F.col('res_cons') * configure.health_score_weights['res_cons'])
# Second round of filling: a few cells may have no data at all; join on the state table data for a second check. Get the state data first.
batu_id = data_load.get_id_by_tier(tier='batu')
state = data_load.get_state(targets=['dev_id', 'station', 'dt'], date=date, dev_id=batu_id)\
.withColumnRenamed('dev_id', 'batu_id')  # battery unit state table
rel = data_load.get_dev_rel(targets=['batu_id', 'cell_id', 'cell_title']
,date=date)  # temporary workaround
state = state.join(rel, on=['batu_id'], how='left').dropDuplicates()\
.withColumn('numb', F.split(F.col('cell_title'), '-')).withColumn('v0', F.col('numb')[0])\
.withColumn('v1', F.col('numb')[1].astype(t.IntegerType())).drop('batu_id')
# Join the state data and fill in missing health_score values
cell_health = cell_health.join(state, on=['cell_id', 'cell_title', 'dt', 'station', 'numb', 'v0', 'v1'],
how='right')
cell_health = cell_health.groupby(['v0', 'station', 'dt']).apply(udf_collections.fill_health_score).drop('numb')\
.drop('v0').drop('v1')
# At this point the day's cell_health_score is ready
print("Daily cell_health_score computed")
# Average the health_score over the past 6 days plus today
dates = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(cycles)][1:]
extra_health_data = data_load.get_data_general(table=configure.cell_health_table, date=dates)
health_table = cell_health.unionByName(extra_health_data)
cell_health_score = health_table.groupby(['cell_id']).apply(udf_collections.avg_health_score)
print("计算过去6天+今天的health_score平均值处理完毕")
return cell_health_score
# Consistency score summary
def health_score(date, cycles=None):
consistency_weights = configure.consistency_weights  # consistency weights
health_weights = configure.consistency_score_weights  # health score weights
# Compute temperature and voltage consistency
temp_cons = cal_consistency(date=date, col_name='temp', weights=consistency_weights, cycles=cycles)
vol_cons = cal_consistency(date=date, col_name='vol', weights=consistency_weights, cycles=cycles)
# Merge the data and assign scores
health_df = vol_cons.join(temp_cons, on=['batc_id', 'dev_id', 'cell_title', 'dt', 'station'], how='left')
health_df = health_df.withColumn('health_score', F.col('vol_cons') * health_weights['vol_weight']
+ F.col('temp_cons') * health_weights['temp_weight'])
health_df = health_df.select(['batc_id', 'dev_id', 'cell_title', 'vol_cons', 'temp_cons', 'health_score', 'dt',
'station'])
return health_df
# Micro short-circuit detection
def cal_micro_circuit(date):
data = data_load.get_cell_base(targets=['batu_id', 'cell_id', 'vol', 'power', 'state', 'times', 'dt', 'station'],
date=date)
data = data_check.check_cell_data(data, 'vol')
window = Window.partitionBy(['cell_id', 'state'])
data = data.withColumn('max_times',F.max('times').over(window)).filter(F.col('max_times')==F.col('times'))\
.drop('times').drop('max_times')
# Compute the 5th-percentile value
cols = ['station', 'batu_id', 'state', 'dt']
p = F.expr('percentile_approx(vol, {x})'.format(x=5/100))
percent_target = data.groupby(cols).agg(p.alias('percent_target'))
# Cells whose voltage falls below the percentile are flagged as possible micro short-circuits
cell_low = data.join(percent_target, on=cols, how='left') \
.withColumn('micro_flag', F.when(F.col('vol') <= F.col('percent_target'), 1).otherwise(0))
cell_micro = cell_low.groupby(['cell_id', 'dt', 'station']).agg(F.sum('micro_flag').alias('micro_circuit')) \
.withColumn('micro_circuit', F.when(F.col('micro_circuit') > 1, 1).otherwise(0))
cell_micro = cell_micro.select(['cell_id', 'micro_circuit', 'dt', 'station'])
return cell_micro
# Lithium-plating detection
def cal_lithium(date):
ica_data = cal_ica(date)
window = Window.partitionBy(['cell_id', 'dt'])
cell_max_vol = ica_data.withColumn('cell_max_vol', F.max('vol').over(window))
cell_lithium = cell_max_vol.groupby('cell_id').apply(udf_collections.analysis_lithium)
cell_lithium = cell_lithium.select(['cell_id', 'lithium', 'dt', 'station'])
return cell_lithium
# Internal resistance calculation
def cal_resistance(date):
cell_base_data = data_load.get_cell_base(date=date, targets=['cell_id', 'state', 'times', 'vol', 'cur', 'dt', 'station'])\
.withColumn('hour', F.hour('times')).filter(F.col('state')=='charge')
cell_base_data = data_check.check_cell_data(data=cell_base_data, col_name='vol').dropDuplicates()
cell_resistance = cell_base_data.groupby(['cell_id']).apply(udf_collections.calculate_resistance)
cell_resistance = cell_resistance.select(['cell_id', 'resistance', 'dt', 'station'])
return cell_resistance
# Cell capacity
def cal_cell_capacity(date):
# Get each cell's power, vol and soc during discharge
cell_base = data_load.get_cell_base(date, targets=['batu_id', 'batc_id', 'cell_id', 'vol', 'power', 'soc', 'state',
'times', 'dt', 'station'])\
.filter(F.col('state') == 'discharge').withColumn('vol',F.round('vol', 3))
print("-------------------------------",len(cell_base.collect()))
cell_base = data_check.check_cell_data(cell_base, 'vol')  # data check
freq_index = data_load.get_index('freq').withColumn('freq', F.translate('freq','S',''))\
.withColumn('freq', F.translate('freq','s','').cast(t.IntegerType()))  # normalise freq strings (e.g. '10S' -> 10) for later arithmetic
# Compute cell capacity
cell_window = Window.partitionBy(['cell_id', 'dt'])
cell_base_data = cell_base.join(F.broadcast(freq_index), on=['station'], how='left')\
.dropna().withColumn('power_sum', F.sum('power').over(cell_window)) \
.withColumn('capacity', F.abs(F.col('power_sum')) / (3600 / F.col('freq')))
# grade_vol is the end-of-discharge voltage; keep records that meet the full-charge/discharge criterion
charge_index = data_load.get_index('full_charge_index')
cell_base_data = cell_base_data.withColumn('grade_vol', F.min('vol').over(cell_window))\
.withColumn('start_soc', F.min('soc').over(cell_window)).withColumn('end_soc', F.max('soc').over(cell_window))\
.withColumn('soc_diff', F.col('end_soc')-F.col('start_soc'))\
.join(F.broadcast(charge_index), on=['station'], how='left')
cell_base_data = cell_base_data.cache()
cell_base_data.count()
cell_base_data = cell_base_data.filter(F.col('soc_diff')>F.col('full_charge_index'))
# Build the extra-capacity lookup table used to top up the measured capacity
extra_cap = cell_base_data.groupby(['batu_id', 'dt']).apply(udf_collections.get_model_f)\
.withColumn('vol', F.col('vol').astype(t.StringType()))
# Take the last-moment record
cell_capacity = cell_base_data.filter(F.col('vol') == F.col('grade_vol')) \
.withColumn('latest_time', F.max('times').over(cell_window)).filter(F.col('times') == F.col('latest_time'))
cell_capacity = cell_capacity.withColumn('grade_vol', F.col('grade_vol').astype(t.StringType()))
# Join the extra capacity data and compute the final capacity
cell_capacity = cell_capacity.join(F.broadcast(extra_cap), on=['batu_id', 'dt', 'vol'], how='left') \
.withColumn('final_cap', (F.col('extra_cap') + F.col('capacity')) * 100 / F.abs(F.col('soc_diff'))) \
.withColumnRenamed('capacity', 'min_cap').withColumnRenamed('final_cap', 'capacity')
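# Illustrative example (hypothetical numbers): extra_cap = 10 kWh plus a measured capacity of 140 kWh over an
# 80% SOC swing gives a full-range estimate of (10 + 140) * 100 / 80 = 187.5 kWh.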
# Tidy up and return
cell_capacity = cell_capacity.select(['batu_id', 'cell_id', 'capacity', 'state', 'start_soc', 'end_soc',
'soc_diff', 'min_cap', 'dt', 'station'])
cell_base_data.unpersist()
return cell_capacity
# Battery O&M capacity data
def capacity_estimate(date):
cell_capacity = data_load.get_data_general(table=configure.cell_capacity_table, date=date)
# Use statistics of the last ~3 months (90 days) of discharge SOC to drop abnormal or incomplete discharge records, so they do not distort the cell capacity estimate
days = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(90)]
dev_soc = data_load.get_state(date=days, state='discharge')
p = F.expr('percentile_approx(soc_diff, {x})'.format(x=0.75))
percent = dev_soc.groupby('dev_id').agg(p.alias('median_v'))
batu_soc_normal = percent.select(['dev_id', 'median_v']).drop_duplicates().withColumnRenamed('dev_id', 'batu_id')
# Drop cell capacity records whose SOC swing is below the per-unit 75th percentile
cell_capacity = cell_capacity.join(batu_soc_normal, on=['batu_id'], how='left')\
.filter(F.col('soc_diff') >= F.abs(F.col('median_v'))).drop('median_v')
rel = data_load.get_dev_rel(targets=['cell_id', 'batu_num', 'batc_num', 'cell_num'], date=date)
cell_capacity = cell_capacity.withColumnRenamed('dev_id', 'cell_id')\
.join(rel, on='cell_id', how='left')\
.withColumnRenamed('batu_num', 'unit_num')\
.withColumnRenamed('batc_num', 'cluster_num')\
.withColumnRenamed('cell_num', 'battery_num')\
.withColumn('battery_num', F.col('battery_num').astype(t.IntegerType()))\
.withColumn('cluster_num', F.col('cluster_num').astype(t.IntegerType()))\
.withColumn('unit_num', F.col('unit_num').astype(t.IntegerType()))\
.withColumn('capacity', F.col('capacity') / 1000)\
.select(['unit_num', 'cluster_num', 'battery_num', 'capacity', 'dt', 'station'])\
.withColumn('dt', F.lit('3000-01-01'))
return cell_capacity
# ICA calculation
def cal_ica(date):
data = data_load.get_cell_base(targets=['batu_id', 'cell_id', 'cur', 'vol', 'soc', 'state', 'times', 'dt',
'station'], date=date)
ica_data = data.filter(F.col('state')=='charge').groupby(['cell_id']).apply(udf_collections.calculate_ica)
ica_data = ica_data.withColumn('delta_q_v', F.col('delta_q') / F.col('delta_v'))
ica_data = ica_data.select(['cell_id', 'times', 'vol', 'qt', 'delta_q', 'delta_v', 'delta_q_v', 'station', 'dt'])
return ica_data
# Consistency
def cal_consistency(date, col_name, weights, cycles=None):
if cycles is None:
cycles = 1
# Get the target columns from the cell_base table
dates = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(cycles)]
data = data_load.get_cell_base(date=dates, targets=['batu_id', 'batc_id', 'cell_id', col_name, 'state', 'times',
'dt', 'station']).drop_duplicates()
data = data_check.check_cell_data(data=data, col_name=col_name)
# Get the end-of-charge/discharge records (voltage / temperature)
window = Window.partitionBy(['cell_id', 'state'])
data = data.withColumn('max_times', F.max('times').over(window)).filter(F.col('max_times') == F.col('times')) \
.drop('times').drop('max_times')
# Data volume check
data_check.check_cell_number(data)
# 3-sigma check: flag 1 when a value falls outside the band
col_window = Window.partitionBy(['batu_id', 'dt', 'state'])
data = data.withColumn('mean', F.mean(col_name).over(col_window)).withColumn('std', F.stddev(col_name).over(
col_window))
window = Window.partitionBy(['cell_id', 'batc_id'])
flag_data = data \
.withColumn('once_flag', F.when((F.col(col_name) >= (F.col('mean') - F.col('std'))) &
(F.col(col_name) <= (F.col('mean') + F.col('std'))), 0).otherwise(1)) \
.withColumn('twice_flag', F.when((F.col(col_name) >= (F.col('mean') - 2 * F.col('std'))) &
(F.col(col_name) <= (F.col('mean') + 2 * F.col('std'))), 0).otherwise(1)) \
.withColumn('triple_flag', F.when((F.col(col_name) >= (F.col('mean') - 3 * F.col('std'))) &
(F.col(col_name) <= (F.col('mean') + 3 * F.col('std'))), 0).otherwise(1))\
.withColumn('denominator', F.count('cell_id').over(window))
# Count how often each cell falls outside the bands
flag_data = flag_data.withColumn('once_sum', F.sum('once_flag').over(window)) \
.withColumn('twice_sum', F.sum('twice_flag').over(window)) \
.withColumn('triple_sum', F.sum('triple_flag').over(window)) \
.select(['cell_id', 'batc_id', 'once_sum', 'twice_sum', 'triple_sum', 'dt',
'station', 'denominator']).drop_duplicates()  # note: this should be keyed on cell_id and batc_id
# Compute the consistency score
flag_data = flag_data.withColumn('num', (F.col('once_sum') * weights[0] + F.col('twice_sum') * weights[1]
+ F.col('triple_sum') * weights[2])/F.col('denominator') ) \
.withColumn('{}_cons'.format(col_name), (1 - F.col('num')) * 100)
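# Illustrative example (hypothetical weights [0.5, 0.3, 0.2]): a cell with one sample outside the 1-sigma band
# but inside 2-sigma (once_sum=1, twice_sum=0, triple_sum=0) and denominator=10 gives num = 0.5 / 10 = 0.05,
# i.e. a consistency score of (1 - 0.05) * 100 = 95.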
# Reformat the output
rel = data_load.get_dev_rel(targets=['cell_id', 'cell_title'], date=dates)
flag_data = flag_data.join(rel, on=['cell_id'], how='left').withColumnRenamed('cell_id', 'dev_id')
flag_data = flag_data.select(['batc_id', 'dev_id', 'cell_title', '{}_cons'.format(col_name), 'dt', 'station'])
return flag_data
# Cell capacity (confidence score)
def cal_cell_capacity_conf(date, cycles):
days = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(cycles)]
cell_capacity = data_load.get_data_general(table=configure.cell_capacity_table, date=days)
# Besides the weighted-average capacity, a 3-sigma rule could also be used for scoring here.
cell_capacity = cell_capacity.groupby(['cell_id']).apply(udf_collections.get_conf)
batu_window = Window.partitionBy(['batu_id', 'dt'])
cell_capacity = cell_capacity.withColumn('upper_cap', F.col('mean_cap') * 1.05)\
.withColumn('lower_cap', F.col('mean_cap') * 0.95).withColumn(
'mark', F.when((F.col('capacity') > F.col('lower_cap')) & (F.col('capacity') < F.col('upper_cap')), 1)
.otherwise(0))
cell_total_index = data_load.get_index('cell_total')
cell_max_index = data_load.get_index('cell_max_capacity')
cell_capacity = cell_capacity.join(F.broadcast(cell_max_index), on=['station'], how='left')\
.join(F.broadcast(cell_total_index), on=['station'], how='left')\
.withColumn('conf_score', F.round((F.sum('mark').over(batu_window))*100/F.col('cell_total'),2))\
.withColumn('capacity', F.when(F.col('capacity')>F.col('cell_max_capacity'), F.col('cell_max_capacity'))
.otherwise(F.col('capacity'))).drop('upper_cap').drop('lower_cap').drop('mean_cap').drop('mark')
cell_capacity = cell_capacity.select(['batu_id', 'cell_id', 'capacity', 'state', 'soc_diff', 'min_cap',
'conf_score', 'dt', 'station'])
return cell_capacity
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
import pyspark.sql.functions as F
from pyspark.sql import Window
from battery_health.functions.data_verification import data_check
id_dict = {'Cell': 'cell_id', 'BatteryCluster': 'batc_id'}
val_dict = {'SOC': 'soc', 'Voltage': 'vol', 'Current': 'cur', 'Temperature': 'temp'}
def data_prep(date, station):
"""
Data preprocessing for the battery health score calculation; results are written to cell_base_data.
Tier: cell
Data: [temp, soc, vol, cur, power]
Time range: the day's largest charge and discharge passes.
"""
# Get the station's charge/discharge state, in order to pick out the largest charge and discharge pass of the day
state = data_load.get_state(date=date, station=station,
dev_id=configure.general_map[station]['id']['batu']['dev_id'])
if data_check.check_state_data(state):
print('Station:',station,'is Empty!')
return 0
window = Window.partitionBy(['state', 'dt', 'dev_id'])
state = state.withColumn('charge_flag', F.max('soc_diff').over(window))\
.withColumn('discharge_flag', F.min('soc_diff').over(window))
charge_state = state.filter((F.col('soc_diff') == F.col('charge_flag'))&(F.col('state') == 'charge'))
discharge_state = state.filter((F.col('soc_diff') == F.col('discharge_flag'))&(F.col('state') == 'discharge'))
state = charge_state.unionByName(discharge_state)
state.show()
# Get the hours covered by the charge/discharge passes, to cut down the amount of data read
state_hour = state.groupby(['dev_id', 'state']).apply(udf_collections.get_state_hour)\
.withColumnRenamed('dev_id', 'batu_id')
state_hour.show()
# Fetch the corresponding data
temp_data = get_data_vol(date=date, station=station, tier='Cell', state_hour=state_hour, pid_name='Temperature')
volt_data = get_data_vol(date=date, station=station, tier='Cell', state_hour=state_hour, pid_name='Voltage')
cur_data = get_data_vol(date=date, station=station, tier='BatteryCluster', state_hour=state_hour, pid_name='Current')
soc_data = get_data_vol(date=date, station=station, tier='BatteryCluster', state_hour=state_hour, pid_name='SOC')
if station == 'hnc':
cur_data = cur_data.withColumn('cur', F.col('cur')*-1)
# Merge the data according to the device hierarchy
rel = data_load.get_dev_rel(date=date, targets='cell_id, batc_id, batu_id',
station_id_list=configure.general_map[station]['details']['station_id'])
data = volt_data.join(rel, on=['cell_id'], how='left')\
.join(cur_data, on=['times','dt','station','batc_id'], how='full')\
.join(soc_data, on=['times','dt','station','batc_id'], how='full')\
.join(temp_data, on=['times','dt','station','cell_id'], how='full')
data = data.withColumn('power', F.col('cur') * F.col('vol'))  # compute power
# Filter the data by the exact charge/discharge start and end times
state = state.withColumnRenamed('dev_id', 'batu_id')
charge_data = data.join(
F.broadcast(state.filter(F.col('state') == 'charge').select(['batu_id', 'state', 'start_time', 'end_time'])),
on='batu_id', how='left')\
.filter((F.col('times') >= F.col('start_time')) & (F.col('times') <= F.col('end_time')))
discharge_data = data.join(
F.broadcast(state.filter(F.col('state') == 'discharge').select(['batu_id', 'state', 'start_time', 'end_time'])),
on='batu_id', how='left')\
.filter((F.col('times') >= F.col('start_time')) & (F.col('times') <= F.col('end_time')))
# Combine the data
base_data = charge_data.unionByName(discharge_data).withColumn('dt',F.lit(date))
base_data = base_data.select(['batu_id', 'batc_id', 'cell_id', 'times', 'state', 'vol', 'cur', 'power', 'soc',
'temp', 'station', 'dt', ]).drop_duplicates()
return base_data
def get_data_vol(date, station, tier, state_hour, pid_name):
"""
Fetch data for the target tier during the corresponding charge/discharge passes.
"""
# Get the hours that contain data
sh = state_hour.select(['dt', 'hour','state']).toPandas()
hour_list = list(sh[sh['dt'] == max(sh.dt.unique())].hour)
hour_list = list(set(hour_list))
# Get the dev_pid data
dev_pid = data_load.get_dev_pid(pid_name=pid_name, date=date, targets=['dev_id', 'pid'], tier=tier,
station_id_list=configure.general_map[station]['details']['station_id'])
pid_list = list(dev_pid.toPandas().pid)
# Fetch the data by pid and hour
data = data_load.get_ods_data(pid=pid_list, date=date, targets=['pid', 'val', 'times', 'dt', 'station', 'hour'],
station=station, hour=hour_list)
# Fetch yesterday's data (more than one dt means the pass crosses midnight)
if len(sh.dt.unique())>1:
print("存在跨天数据")
yes_date = sh.dt.min()
hour_list = list(sh[sh['dt'] == yes_date].hour)
hour_list = list(set(hour_list))
extra_data = data_load.get_ods_data(pid=pid_list, date=yes_date, station=station, hour=hour_list,
targets=['pid', 'val', 'times', 'dt', 'station', 'hour'])
data = data.unionByName(extra_data)
data = data.join(state_hour, on=['dt', 'hour'], how='left')  # attach the state data
# Resample and fill gaps
freq_df = data_load.get_index('freq')
data = data.join(freq_df, on=['station'], how='left').groupby(['pid', 'state'])\
.apply(udf_collections.fill_time_by_resample)
# Tidy up and return
data = data.withColumnRenamed('val',val_dict[pid_name]).join(dev_pid, on=['pid'],how='left')\
.drop('pid').withColumnRenamed('dev_id',id_dict[tier])
return data
def get_cell_base(date):
# Preprocess station by station: running all stations in parallel produces too much data, which triggers long GC pauses and crashes the service.
for i in configure.station_list:
data = data_prep(date=date, station=i)
if data != 0:
data.write.mode('overwrite').insertInto(configure.cell_base_table, overwrite=True)
print(i, "cell_base prepared")
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Identify the charge/discharge start and end times of energy storage units, battery units and battery clusters
from SOC and power (voltage * current), and also use those windows to compute capacity at each tier.
Outputs: [start time, end time, charge/discharge state, SOC change, full charge/discharge flag] [capacity]
'''
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pyspark.sql.functions as F
from pyspark.sql import Window
from battery_health import data_load, configure
def calculate_dev_capacity(date):
"""
Compute capacity from dev_data.
Tiers: [enu, batu, batc]
"""
data = data_load.get_data_general(configure.dev_table, date)
state = data_load.get_state(date)
state = state.select(['dev_id', 'dt', 'state', 'start_time', 'end_time'])
window = Window.partitionBy(['dev_id', 'dt', 'state', 'start_time', 'end_time'])
capacity_df = data.join(F.broadcast(state), on=['dev_id', 'dt'], how='inner')
# Compute capacity (power summed over the pass, assuming a fixed 10 s sampling interval)
capacity_df = capacity_df.filter((F.col('times') >= F.col('start_time')) & (F.col('times') <= F.col('end_time')))\
.withColumn('power_sum', F.sum('power').over(window))\
.withColumn('capacity', F.abs(F.col('power_sum')) / (3600 / 10))
capacity_df = capacity_df.select(['dev_id', 'state', 'start_time', 'capacity', 'dt', 'station']).drop_duplicates()
return capacity_df
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
import pyspark.sql.functions as F
import pyspark.sql.utils  # used by the ParseException handler in get_dev_data
import pandas as pd
from dateutil.relativedelta import relativedelta
from battery_health.functions.data_verification import data_check
RENAME_DICT = {'SOC':'soc', 'Voltage':'vol', 'Current':'cur', 'ActivePower':'power'}
# For the dev_data calculation: fetch all data of the given type (all stations, all tiers)
def get_dev_data(pid_name, date, yesterday_hour = None, freq=None):
"""
This function fetches a single data type.
The retrieval can be described as: "from the metadata tables, fetch data of type D, at tier C, on date B, for station A".
Although the signature can accept several types at once, querying one type at a time is recommended.
:param yesterday_hour:
:param date:
:type pid_name: list or str
:param freq: defaults to 10 s
:return: ['dev_id', 'times', 'val', 'dt', 'station']
"""
# Temporary sampling-frequency solution, fixed at 10 s for now; per-station customisation would need a separate UDF.
# From the pid-device table, get every pid of the requested data type (pid_name).
dev_pid = data_load.get_dev_pid(pid_name=pid_name, date=date).select(['pid', 'dev_id'])
pid_list = list(dev_pid.toPandas()['pid'])
# Temporary workaround for hnc, where the battery unit active power is bound to the PCS
if pid_name == 'ActivePower':
try:
hnc_id = [configure.general_map['hnc']['details']['station_id']]
dc_dev_pid = data_load.get_dev_pid(station_id_list = hnc_id, tier=['PCS'], pid_name=pid_name,
date=date).select(['pid', 'dev_id'])\
.replace(['628549'], ['628607'], 'dev_id')
dev_pid = dev_pid.unionByName(dc_dev_pid)
pid_list = list(dev_pid.toPandas()['pid'])
print('change hnc batc_id from 628549 to 628607 and use DcActivePower.')
except pyspark.sql.utils.ParseException:
print('Empty DcActivePower of PCS !!!')
# Fetch the base data for the day
data = data_load.get_ods_data(targets=['pid', 'val', 'times', 'dt', 'station'], pid=pid_list, date=date)
last_date = str((pd.to_datetime(date[0]) - relativedelta(days=1)).date())
last_date = str(last_date).replace('[', '(').replace(']', ')')
# Triggered when yesterday's data is also needed
if yesterday_hour is not None:
# Fetch yesterday's data: for each station take the last n hours of the previous day.
extra_data = data_load.get_ods_data(targets=['pid', 'val', 'times', 'dt', 'station'], pid=pid_list,
date=last_date, hour=yesterday_hour)
# Merge data
data = data.unionByName(extra_data)
else :
extra_data = data_load.get_ods_data(targets=['pid', 'val', 'times', 'dt', 'station'], pid=pid_list,
date=last_date)
# Merge data
data = data.unionByName(extra_data)
# Resample at the preset frequency and rename columns to simplify the later joins.
freq_df = data_load.get_index('freq')
data = data.join(freq_df, on=['station'], how='left').groupby('pid').apply(udf_collections.fill_time_by_resample)
data = data.join(dev_pid, on=['pid'], how='left').withColumnRenamed('val', RENAME_DICT[pid_name]).drop('pid')
return data
# Assemble dev_data; where power is unavailable, derive it as cur * vol
def data_prep(date, yesterday_hours = None):
"""
:param date: date
:param yesterday_hours: take the last n hours of the previous day; omitting it (or passing a value outside [0, 23]) disables it
:return: dev_data
"""
# Work out yesterday's last n hours and unify the date format.
if yesterday_hours is not None:
yes_range = [str(24-i) for i in range(yesterday_hours)]
else:
yes_range = None
if isinstance(date, str):
date = [date]
# Fetch the four data types.
soc_data = get_dev_data(pid_name='SOC', date=date, yesterday_hour=yes_range)
power_data = get_dev_data(pid_name='ActivePower', date=date, yesterday_hour=yes_range)
cur_data = get_dev_data(pid_name='Current', date=date, yesterday_hour=yes_range)
vol_data = get_dev_data(pid_name='Voltage', date=date, yesterday_hour=yes_range)
# Merge the data and compute power.
same_columns = ['dev_id', 'times', 'dt', 'station']
data = soc_data.join(cur_data, on=same_columns, how='left').join(vol_data, on=same_columns, how='left')\
.join(power_data, on=same_columns, how='left').withColumn('dt', F.lit(date[0]))
data = data.withColumn('power',F.when(F.col('power').isNull(),F.col('vol') * F.col('cur') / 1000)
.otherwise(F.col('power'))).drop('cur').drop('vol').dropna()
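# Note: when the ActivePower signal is missing, power is approximated as vol * cur / 1000
# (the /1000 is presumably a unit conversion, e.g. W to kW).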
data = data.withColumn('power',F.when(F.col('station')=='hnc',F.col('power')*-1).otherwise(F.col('power')))
data_check.check_dev_data(data)
data = data.select(['dev_id', 'times', 'soc', 'power', 'station', 'dt'])
return data
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import udf_collections, configure, data_load
import pyspark.sql.functions as F
import pyspark.sql.types as t
def calculate_state(date):
data = data_load.get_data_general(configure.dev_table, date)
# Get the required device IDs. This is not written well and should be refactored later. (hnc does not need batc; batu also uses BATU_ID from the config.)
dev_list = configure.dev_id_list
# Get the other indices (threshold, full-charge/discharge criterion) and join them on for later use.
full_ind_df = data_load.get_index(index='full_charge_index')
threshold_df = data_load.get_index(index='threshold')
index_df = full_ind_df.join(threshold_df, on=['station'],how='left')
prep_data = data.join(index_df, on=['station'],how='left')
prep_data = prep_data.withColumn('power', F.when(F.abs(F.col('power')) < F.col('threshold'), 0)
.otherwise(F.col('power')))
# Only the basic calculation is done so far; a data validation step is still needed
zero_df = prep_data.filter(F.col('dev_id').isin(dev_list)).groupby(['dev_id']).apply(udf_collections.get_state)
zero_df = zero_df.withColumn('dt', F.col('end_time').astype(t.DateType())).filter(F.col('dt')==date)
zero_df = zero_df.select(['dev_id', 'start_time', 'end_time', 'state', 'soc_diff', 'full_charge', 'dt', 'station'])
return zero_df
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Derive secondary statistics from the data to explore how the system behaves numerically.
Outputs: [end-of-charge SOC, end-of-discharge SOC, end-of-charge max voltage difference (batu), end-of-discharge max voltage difference (batu),
end-of-charge voltage difference (cell), end-of-discharge voltage difference (cell), end-of-charge voltage standard deviation (batu),
end-of-discharge voltage standard deviation (batu), end-of-charge voltage deviation ratio (cell), end-of-discharge voltage deviation ratio (cell)]
'''
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health.functions.data_verification import data_check
from battery_health import data_load, configure
from dateutil.relativedelta import relativedelta
import pyspark.sql.functions as F
from pyspark.sql import Window
import pandas as pd
def get_state_val(data, state, pid_name):
# Select the given state and data type.
name_dict = {'vol_diff': 'EndVoltageDiff', 'deviation': 'EndVoltageDeviation', 'vol_range': 'EndMaxVoltDiff',
'std_dev': 'EndVoltSTD', 'soc': 'EndSOC'}
state_dict = {'charge': 'Charge', 'discharge': 'Discharge'}
data = data.select(['dev_id', 'state', pid_name, 'dt', 'station'])
data = data.filter(F.col('state')==state).withColumnRenamed(pid_name, 'val')\
.withColumn('pid_name', F.lit('{state}{pid_name}'.format(state=state_dict[state], pid_name=name_dict[pid_name])))
data = data.select(['dev_id', 'pid_name', 'val', 'dt', 'station'])
return data
def get_statistic(date, cycles):
days = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(cycles)]
data = data_load.get_data_general(configure.statistic_mid_table, date=days)
# The end-of-charge SOC must exceed a certain level
data = data_check.check_soc_data(data=data, col_name='soc')
# Check that the voltage data is valid
data = data_check.check_cell_data(data=data, col_name='vol')
window = Window.partitionBy(['batu_id', 'state', 'dt', 'times'])
window_dt = Window.partitionBy(['batu_id', 'state', 'dt'])
analysis_data = data.withColumn('std_dev', F.stddev('vol').over(window)) \
.withColumn('max_dev', F.max('std_dev').over(window_dt)).filter(F.col('std_dev') == F.col('max_dev'))
analysis_data = analysis_data.cache()
analysis_data.count()
# Battery unit health metrics
batu_health = analysis_data.withColumn('vol_range', F.max('vol').over(window_dt) - F.min('vol').over(window_dt)) \
.withColumnRenamed('batu_id', 'dev_id')\
.select(['dev_id', 'state', 'soc', 'vol_range', 'std_dev', 'dt', 'station']).drop_duplicates()
# Cell health metrics
cell_health = analysis_data.withColumn('max_vol', F.max('vol').over(window_dt)) \
.withColumn('min_vol', F.min('vol').over(window_dt)) \
.withColumn('vol_diff', F.when(F.col('state') == 'charge', F.col('max_vol') - F.col('vol'))
.otherwise(F.col('vol') - F.col('min_vol'))) \
.withColumn('mean_vol', F.mean('vol').over(window_dt)) \
.withColumn('deviation', (F.col('vol') - F.col('mean_vol')) / F.col('mean_vol')) \
.withColumnRenamed('cell_id', 'dev_id') \
.select(['dev_id', 'state', 'soc', 'vol_diff', 'deviation', 'dt', 'station'])
batu_health = batu_health.cache()
batu_health.count()
cell_health = cell_health.cache()
cell_health.count()
# Battery unit and cell: end-of-charge/discharge SOC
batu_charge_soc = get_state_val(data=batu_health, state='charge', pid_name='soc')
batu_discharge_soc = get_state_val(data=batu_health, state='discharge', pid_name='soc')
cell_charge_soc = get_state_val(data=cell_health, state='charge', pid_name='soc')
cell_discharge_soc = get_state_val(data=cell_health, state='discharge', pid_name='soc')
# Battery unit: end-of-charge/discharge voltage range and voltage standard deviation
batu_diff_charge = get_state_val(batu_health, state='charge', pid_name='vol_range')
batu_diff_discharge = get_state_val(batu_health, state='discharge', pid_name='vol_range')
batu_std_charge = get_state_val(batu_health, state='charge', pid_name='std_dev')
batu_std_discharge = get_state_val(batu_health, state='discharge', pid_name='std_dev')
# Cell: end-of-charge/discharge voltage difference and voltage deviation ratio
cell_vol_diff_charge = get_state_val(data=cell_health, state='charge', pid_name='vol_diff')
cell_vol_diff_discharge = get_state_val(data=cell_health, state='discharge', pid_name='vol_diff')
cell_vol_deviation_charge = get_state_val(data=cell_health, state='charge', pid_name='deviation')
cell_vol_deviation_discharge = get_state_val(data=cell_health, state='discharge', pid_name='deviation')
# Combine data
soc_data = batu_charge_soc.unionByName(batu_discharge_soc).unionByName(cell_charge_soc) \
.unionByName(cell_discharge_soc)
batu_data = batu_diff_charge.unionByName(batu_diff_discharge).unionByName(batu_std_charge) \
.unionByName(batu_std_discharge)
cell_data = cell_vol_diff_charge.unionByName(cell_vol_diff_discharge).unionByName(cell_vol_deviation_charge)\
.unionByName(cell_vol_deviation_discharge)
data = soc_data.unionByName(batu_data).unionByName(cell_data)
# Convert the time to a millisecond timestamp
data = data.withColumn('dt', F.lit(date))
data = data.withColumn('times', F.unix_timestamp(F.col('dt'),"yyyy-MM-dd")*1000).withColumn('dt', F.lit(date))
data = data.select(['dev_id', 'pid_name', 'times', 'val', 'dt', 'station'])
analysis_data.unpersist()
batu_health.unpersist()
cell_health.unpersist()
return data
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
from battery_health.functions import data_verification
from battery_health.functions.data_verification import data_check
import pyspark.sql.functions as F
import pandas as pd
import numpy as np
def get_tier_data(date, pid_name, tier, state, station_list=None):
print("-----------------------------------pid_name:", pid_name)
if station_list is None:
station_list = configure.station_list
# State-hour table
state_hour = state.withColumn('hour', F.substring(F.col('end_time'), 12, 2)).select(['station', 'hour']).dropDuplicates()
station_id = [configure.general_map[i]['details']['station_id'] for i in station_list]
# pid-device information
dev_pid = data_load.get_dev_pid(pid_name=pid_name, targets=['dev_id', 'pid'], date=date, tier=tier, station_id_list=station_id)
pid = list(dev_pid.toPandas().pid.unique())
# Fetch all the data first, then keep only the matching hours via 'hour'.
data = data_load.get_ods_data(pid=pid, date=date, targets=['pid', 'val', 'times', 'dt', 'station', 'hour'], station=station_list)
data = data.join(F.broadcast(state_hour), on=['station','hour'], how='inner')
# Data patching: append the start/end timestamps so that resampling cannot miss them
state_data = state.withColumnRenamed('end_time', 'times').join(dev_pid, on=['dev_id'], how='left') \
.select(['pid', 'times', 'dt', 'station']).withColumn('val', F.lit(None).cast('double')) \
.withColumn('hour', F.substring(F.col('times'), 12, 2))
# Filter rows with null keys out of state_data before unionByName
state_data = state_data.filter(
(F.col('pid').isNotNull()) & (F.col('dt').isNotNull()) & (F.col('hour').isNotNull())
)
# union
data = data.unionByName(state_data)
# Mock-data code start -------------------------------------------------------------
# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType, StructField, StringType, DoubleType
#
# # Initialise Spark
# spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()
#
# # Define the schema
# schema = StructType([
# StructField("station", StringType(), True),
# StructField("hour", StringType(), True),
# StructField("pid", StringType(), True),
# StructField("val", DoubleType(), True),
# StructField("times", StringType(), True),
# StructField("dt", StringType(), True),
# ])
#
# # Build the data
# data_list = [
# ("hnc", "16", "111510044001206", 3.197, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001206", 3.199, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001209", 3.194, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001217", 3.194, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001221", 3.197, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001226", 3.200, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001221", None, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001231", 3.201, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001231", None, "2024-10-15 16:00:03", "2024-10-15"),
# ("hnc", "16", "111510044001231", 3.201, "2024-10-15 16:00:03", "2024-10-15"),
# ]
#
# # Create the DataFrame
# df = spark.createDataFrame(data_list, schema)
# df.show()
# df = df.groupby(['pid', 'dt', 'hour']).apply(udf_collections.statistic_data_adjustment)
# df.show()
# data = df
# Mock-data code end -------------------------------------------------------------
# Handle the case where the start/end timestamp is the last point of the series
data = data.groupby(['pid', 'dt', 'hour']).apply(udf_collections.statistic_data_adjustment)
# Resample and fill data
freq = data_load.get_index('freq')
data = data.join(F.broadcast(freq), on=['station'], how='left').groupby(['pid', 'dt', 'hour'])\
.apply(udf_collections.fill_time_by_resample)
# Tidy up
data = data.join(dev_pid,on=['pid'], how='left').drop('pid')
s = state.select(['dev_id', 'end_time', 'state', 'dt', 'station']).withColumnRenamed('end_time' ,'times')
data = data.join(F.broadcast(s), on=['times', 'dt', 'station', 'dev_id'], how='right')
return data
def data_prep_station(date):
batu_id = data_load.get_id_by_tier(tier='batu')
state = data_load.get_state(date=date, dev_id=batu_id)  # battery unit state table
state = state.cache()
state.count()
rel = data_load.get_dev_rel(date=date, targets=['batu_id', 'cell_id'])  # device relationship table
cell_state = state.join(rel.withColumnRenamed('batu_id', 'dev_id'), on=['dev_id'], how='full') \
.drop('dev_id').withColumnRenamed('cell_id', 'dev_id').dropna()  # cell state table
# Voltage: every cell has voltage data.
vol_data = get_tier_data(date=date, pid_name='Voltage', tier='Cell', state=cell_state) \
.withColumnRenamed('dev_id', 'cell_id').withColumnRenamed('val', 'vol')
vol_data = data_verification.data_check.check_cell_data(data=vol_data, col_name='vol')
vol_data = vol_data.join(rel, on=['cell_id'], how='left')
soc_data = get_tier_data(date=date, pid_name='SOC', tier='BatteryUnit', state=state,
station_list=configure.station_list) \
.withColumnRenamed('dev_id', 'batu_id').withColumnRenamed('val', 'soc')
soc_data = soc_data.join(rel, on=['batu_id'], how='left')
'''
# cell_SOC, assuming every cell has SOC data.
cell_soc_data = get_tier_data(date=date, pid_name='SOC', tier='Cell', state=cell_state,
station_list=configure.station_list) \
.withColumnRenamed('dev_id', 'cell_id').withColumnRenamed('val', 'cell_soc')
cell_soc_data = cell_soc_data.join(rel, on=['cell_id'], how='left')
# batu_SOC: fetch the battery-unit-level SOC again; it is needed for later calculations
batu_soc_data = get_tier_data(date=date, pid_name='SOC', tier='BatteryUnit', state=state,
station_list=configure.station_list) \
.withColumnRenamed('dev_id', 'batu_id').withColumnRenamed('val', 'batu_soc')
batu_soc_data = batu_soc_data.join(rel, on=['batu_id'], how='left')
soc_data = batu_soc_data.join(cell_soc_data, on=['batu_id', 'cell_id', 'times', 'dt', 'station', 'state'],
how='left')
'''
# Combine the data and write to the table
data = soc_data.join(vol_data, on=['batu_id', 'cell_id', 'times', 'dt', 'station', 'state'], how='left')
# data = data.select(['batu_id', 'cell_id', 'soc', 'vol', 'state', 'times', 'station', 'dt'])  # old order; dt and station are swapped below
data = data.select(['batu_id', 'cell_id', 'soc', 'vol', 'state', 'times', 'dt', 'station'])
return data
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health.functions.statistic_data import statistic_calculation, statistic_data_prep
def data_prep(date):
data = statistic_data_prep.data_prep_station(date=date)
return data
def statistic_summary(date):
data = statistic_calculation.get_statistic(date=date, cycles=1)
return data
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health import data_load, configure, udf_collections
import pyspark.sql.functions as F
import time
import pyspark.sql.types as t
import pandas as pd
from dateutil.relativedelta import relativedelta
from pyspark.sql import Window
from battery_health.functions.data_verification import data_check
import requests
def empty_pids(date, pid_name_list):
struct_time = time.strptime(date, '%Y-%m-%d')
date_s = str(int(time.mktime(struct_time))*1000000000)
dev_pid = data_load.get_dev_pid(pid_name=pid_name_list, date=date, station_id_list='628421',
tier=['BatteryCluster', 'BatteryUnit', 'Storage', 'Cell'])
dp = dev_pid.toPandas()
for i in range(10):
print('round:', i)
dp_ = dp[i::10]
dp_list = dp_.pid.to_list()
#dp_list = ['11159999938769','11159999938775','11159999938774','11159999952235']
s = str(dp_list).replace('[', '%28pid%3D%27').replace(''',''','%27+or+pid%3D%27').replace("'", '')\
.replace(' ', '').replace(']', '%27%29+and+time%3D')+date_s
headers = {
'Accept': 'application/csv',
}
response = requests.get(
# 'https://ts-bp1ue44w0814t8cx9.influxdata.tsdb.aliyuncs.com:8086/query?db=sgool&u=huafeng&p=Huafeng@2022&precision=ms&q=delete+from+data_meas_1114_1d+where+{}'.format(s),
'http://192.168.1.99:48088/query?db=sgool&u=yunhe&p=yunhe2020&precision=ms&q=delete+from+data_meas_1114_1d+where+{}'.format(s),
headers=headers,
)
print('round:', i, 'xmei delete finished')
response = requests.get(
# 'https://ts-bp1ue44w0814t8cx9.influxdata.tsdb.aliyuncs.com:8086/query?db=sgool&u=huafeng&p=Huafeng@2022&precision=ms&q=delete+from+data_meas_1115_1d+where+{}'.format(s),
'http://192.168.1.99:48088/query?db=sgool&u=yunhe&p=yunhe2020&precision=ms&q=delete+from+data_meas_1115_1d+where+{}'.format(s),
headers=headers,
)
print('round:', i, 'hnc delete finished')
print('x')
def cal_batu_temp_cons(date):
batu_window = Window.partitionBy(['batu_id', 'dt', 'times'])
batu_temp = data_load.get_cell_base(date=date, targets=['batu_id', 'temp', 'dt', 'times', 'station'])
batu_temp = data_check.check_cell_data(batu_temp, 'temp')
batu_temp_cons = batu_temp.withColumn('temp_range', F.max('temp').over(batu_window) -
F.min('temp').over(batu_window))
window_temp = Window.partitionBy(['batu_id', 'dt'])
batu_temp_cons = batu_temp_cons.withColumn('temp_range', F.max('temp_range').over(window_temp))
batu_temp_cons = batu_temp_cons.select(['batu_id', 'temp_range', 'dt', 'station']).dropDuplicates()
batu_temp_cons.write.format('hive').insertInto(configure.batu_tempCons_table, overwrite=True)
def sava_to_hive(date):
# create_risk_data(date=date)
# save_data = create_capacity_soh(date=date)
cal_batu_temp_cons(date)
save_data = data_load.get_data_general(configure.soh_table, date=date)
# save_data = spark.sql("select * from test.decay_data")
# health_data = create_health_data(date=date)
health_data = data_load.get_data_general(configure.health_data, date=date)
health_score = data_load.get_data_general(configure.health_score, date=date)
health_score = health_score.withColumn('times', F.lit(str(int(time.mktime(time.strptime(date, "%Y-%m-%d"))) * 1000)))
# days = [str((pd.to_datetime(date) - relativedelta(days=i)).date()) for i in range(365)]
batu_temp_cons = data_load.get_data_general(table=configure.batu_tempCons_table, date=date)\
.withColumnRenamed('batu_id', 'dev_id').withColumnRenamed('dt', 'times').withColumnRenamed('temp_range', 'val')
batu_temp_cons = batu_temp_cons.withColumn('pid_name', F.lit('MaxCellTempRange'))\
.withColumn('times', F.unix_timestamp('times', 'yyyy-MM-dd'))\
.withColumn('times', F.col('times') * 1000).withColumn('dt', F.lit(date))\
.select(['dev_id', 'pid_name', 'times', 'val', 'dt', 'station'])
influxdb_data = save_data.unionByName(health_data).unionByName(health_score).unionByName(batu_temp_cons)
pid_data = data_load.get_dev_pid(targets=['pid', 'dev_id', 'pid_name'], date=date,
tier=['BatteryCluster', 'BatteryUnit', 'Storage', 'Cell'])
hive_data = influxdb_data.join(pid_data, on=['dev_id', 'pid_name'], how='left')\
.filter(F.col('station').isin(configure.station_list))
hive_data = hive_data.withColumn('dt', F.lit(date)).select(['pid', 'times', 'val', 'dt', 'station']).dropna()
# Clear the existing midnight (00:00) data for the pids that are about to be written
empty_pids(date, configure.pid_name_list)
return hive_data
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyspark.sql.functions import pandas_udf, PandasUDFType
import numpy as np
import pandas as pd
import math
from pyspark.sql import functions as F
from pyspark.sql.window import Window
@pandas_udf("pid: string, times: string, val: float, dt: string, station: string", PandasUDFType.GROUPED_MAP)
def fill_time_by_resample(fill_df):
"""
:param fill_df: 必须有的项 ['pid', 'times', 'val', 'dt', 'station', 'freq']
:return: ['pid', 'times', 'val', 'dt', 'station']
"""
fill_df.sort_values('times', inplace=True)
fill_df.drop_duplicates('times', inplace=True)
freq = fill_df.freq.iloc[0]
fill_df.index = pd.to_datetime(fill_df.times)
fill_df = fill_df.resample(freq, closed='right').ffill()
fill_df.bfill(inplace=True)
fill_df.drop('times', axis=1, inplace=True)
fill_df = fill_df.reset_index()
fill_df['times'] = fill_df.times.astype(str)
if 'val' not in list(fill_df.columns):
fill_df = pd.DataFrame(data=None, columns=['pid', 'times', 'val', 'dt', 'station'])
fill_df = fill_df[['pid', 'times', 'val', 'dt', 'station']]
return fill_df
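# Usage sketch (illustrative only; the DataFrame name raw_df and the '60S' frequency are hypothetical):
# fill_time_by_resample is a GROUPED_MAP pandas UDF, so it is applied per point id via groupby().apply(),
# and the input must already carry the columns listed in the docstring above, including 'freq'.
# filled_df = raw_df.withColumn('freq', F.lit('60S')).groupby('pid').apply(fill_time_by_resample)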
@pandas_udf("dev_id: string, start_time: string, end_time: string, time_gap: float, soc_diff: float, "
"state: string, full_charge: int, station: string", PandasUDFType.GROUPED_MAP)
def get_state(data):
"""
通过对设备的SOC数据以及功率数据进行分析识别得到该设备的充放电状态,即充放电的开始结束时间、SOC的变化量等信息
Args:
data: DataFrame [station, dev_id, times, dt, soc, power, full_charge_index, threshold]
station: string, 电站名称
dev_id: string, 设备ID
times: string, 时间
dt: string, 日期
soc: float, SoC
power: double, 功率
full_charge_index: int, 满充满放指标
threshold: int, 阈值
Returns:
dev_state: DataFrame ['dev_id', 'start_time', 'end_time', 'time_gap','soc_diff', 'state', 'full_charge','station']
dev_id: string, 设备ID
dt: string, 日期
start_time: string, 充放电的开始时间
end_time: string, 充放电的结束时间
time_gap: float, 充放电的持续时间
soc_diff: int, 充放电过程中的soc变化
state: string, 充放电状态('charge','discharge')
full_charge: int, 满充满放标志
station: string, 电站名称
"""
# First, use the SoC data to find the time segments within a day where SoC changes monotonically (keeps increasing or keeps decreasing)
data = data.sort_values('times')
data.drop_duplicates(inplace=True)
data.index = list(range(len(data)))
full_charge_index = data.full_charge_index.iloc[0]
threshold = data.threshold.iloc[0]
station = data.station.iloc[0]
dev_id = data.dev_id.iloc[0]
soc_diff =[i - j for i, j in zip(data.soc.iloc[1:], data.soc.iloc[:-1])] + [0]
data['soc_diff'] = soc_diff
data.index = list(range(len(data)))
# Keep only rows where SoC changed; abnormal jumps (e.g. soc_diff = 35) are filtered out here for now, and rows where SoC changes while power is below the threshold are also treated as abnormal
# non_zero_power = data[abs(data['power_diff']) > 100]  # discharge-plateau detection is not needed here
non_zero_data = data[(data.soc_diff != 0) & (abs(data.soc_diff) < 5) & (abs(data.power) > threshold)]
#non_zero_data = data[data.soc_diff != 0]
if len(non_zero_data)<2:
print("Empty device: dev_id:{id}, station:{station} ".format(id = dev_id,station=station))
dev_state = pd.DataFrame(data={'dev_id': [], 'start_time': [], 'end_time': [], 'time_gap': [],
'soc_diff': [], 'state': [], 'full_charge': [], 'station': []})
return dev_state
# Split the charge/discharge start/end times at the points where the SoC trend changes sign
change_points = [0]
for i in range(len(non_zero_data) - 1):
if non_zero_data.soc_diff.iloc[i] * non_zero_data.soc_diff.iloc[i + 1] <= 0:
change_points.append(i)
change_points.append(i+1)
if len(change_points) % 2 == 1:
change_points.append(len(non_zero_data) - 1)
# Within one process, gaps longer than 30 min are not handled here; instead both boundary points are added to the change-point list and filtered later via soc_diff and time_gap
for x, y in zip(change_points[::2], change_points[1::2]):
for i in range(x, y):
if (pd.to_datetime(non_zero_data.times.iloc[i + 1]) - pd.to_datetime(non_zero_data.times.iloc[i])).seconds / 60 > 30:
change_points.append(i)
change_points.append(i+1)
change_points.sort()
# Check whether the states at the start/end times ('charge'/'discharge'/'stay') match the state of the whole process (i.e. look for abnormal boundary points)
# Handling: use the power value as the state indicator and shrink abnormal boundary points inwards
for x, y in zip(change_points[::2], change_points[1::2]):
std_state = non_zero_data.power.iloc[x+1 : y-1].mean()
x_state = non_zero_data.power.iloc[x]
y_state = non_zero_data.power.iloc[y]
while (x_state * std_state <= 0) & (x+1 <= len(non_zero_data)-1):
change_points[change_points.index(x)] += 1
x += 1
x_state = non_zero_data.power.iloc[x]
while (y_state * std_state <= 0) & (y-1 >= 0):
change_points[change_points.index(y)] -= 1
y -= 1
y_state = non_zero_data.power.iloc[y]
power_change_index = list(non_zero_data.index[change_points])
# Fine-tune the boundary points using power (expanding the segments outwards); this approach cannot handle the last discharge on 2022-08-17
for ind in range(len(power_change_index)):
if ind % 2 == 0:  # start point
state = data.power.iloc[power_change_index[ind] + 1 : power_change_index[ind+1]].mean()
while (data.power.iloc[power_change_index[ind]] * state > 0) & (power_change_index[ind]-1 >= 0):
power_change_index[ind] -= 1
else:  # end point
state = data.power.iloc[power_change_index[ind-1] + 1 : power_change_index[ind]].mean()
while (data.power.iloc[power_change_index[ind]] * state > 0) & (power_change_index[ind] + 1 < len(data)):  # without the bound check, data.power.iloc[power_change_index[ind] + 1] would go out of range
power_change_index[ind] += 1
# Assemble the results for return
final_df = data.iloc[power_change_index]
start_time = list(final_df.times.iloc[::2])
end_time = list(final_df.times.iloc[1::2])
time_gap = [round((pd.to_datetime(y) - pd.to_datetime(x)).seconds / 60, 1) for x, y in zip(start_time, end_time)]
start_soc = list(final_df.soc.iloc[::2])
end_soc = list(final_df.soc.iloc[1::2])
state_soc_diff = [x-y for x,y in zip(end_soc,start_soc)]
full_charge = [abs(x) // full_charge_index for x in state_soc_diff]
state_list = ['charge' if i > 0 else 'discharge' for i in state_soc_diff]
dev_state = pd.DataFrame(data = {'dev_id': dev_id , 'start_time': start_time, 'end_time': end_time,
'time_gap': time_gap, 'soc_diff': state_soc_diff, 'state':state_list,
'full_charge': full_charge,'station': station})
dev_state.drop_duplicates(inplace=True)
dev_state = dev_state[dev_state['time_gap']>30]
return dev_state
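# Usage sketch (illustrative only; the DataFrame name dev_df is hypothetical): get_state is a
# GROUPED_MAP pandas UDF and is meant to be applied per device, with the input columns described
# in the docstring (soc, power, full_charge_index, threshold, ...), e.g.:
# state_df = dev_df.groupby('dev_id').apply(get_state)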
@pandas_udf("dev_id: string, dt: string, hour: string, state: string", PandasUDFType.GROUPED_MAP)
def get_state_hour(data):
def str_trans(num):
if num<10:
return '0'+str(num)
else:
return str(num)
# e.g. hour 1 should be rendered as '01'
# [dev_id, state_hour, state]
state = data['state'].iloc[0]
start_time = data['start_time'].iloc[0]
end_time = data['end_time'].iloc[0]
start_hour = int(start_time[-8:-6])
end_hour = int(end_time[-8:-6])
hour_list = []
dt = []
dev_id = data.dev_id.iloc[0]
if start_hour > end_hour:
hours = [str_trans(i) for i in list(range(start_hour, 24))]
dt += len(hours) * [start_time[:10]]
hour_list += hours
hours = [str_trans(i) for i in list(range(0, end_hour+1))]
dt += len(hours) * [end_time[:10]]
hour_list += hours
else:
hours = [str_trans(i) for i in list(range(start_hour, end_hour + 1))]
hour_list += hours
dt += len(hours) * [end_time[:10]]
state_data = pd.DataFrame(data={'dev_id': dev_id, 'dt': dt, 'hour': hour_list, 'state':state})
return state_data
@pandas_udf("cell_id: string, times: string, vol: double, cur: double, soc: double, qt: double, "
"delta_q: double, delta_v: double, station:string, dt: string", PandasUDFType.GROUPED_MAP)
def calculate_ica(data):
data.sort_values('times', inplace=True)
data.dropna(axis=0, inplace=True)
cell_id = data.cell_id.iloc[0]
dt = data.dt.iloc[0]
station = data.station.iloc[0]
# max_vol_time = data[data.vol == data.vol.max()]['times'].iloc[0]
# data = data[data.times <= max_vol_time]
qt_list = []
count = 0
for i in range(len(data)):
tmp = data.cur.iloc[i] / 3600 + count
qt_list.append(tmp)
count = tmp
data['qt'] = qt_list
data.drop_duplicates(subset=['vol'], keep='last', inplace=True)
# Keep only records whose voltage is more than 0.01 V above the previously kept record
tmp_data = data.iloc[0]
while (data.vol.max() - tmp_data.vol.max()) > 0.01:
tt = data[data.vol > (tmp_data.vol.max() + 0.01)]
tmp_data = tmp_data.append(tt.iloc[0])
if len(tmp_data) == len(data.columns):
data = pd.DataFrame(data=None, columns=['cell_id', 'times', 'vol', 'cur', 'soc', 'qt', 'delta_q', 'delta_v',
'station', 'dt'])
else:
data = pd.DataFrame({'cell_id': [cell_id] * len(tmp_data.times), 'times': list(tmp_data.times),
'vol': list(tmp_data.vol), 'cur': list(tmp_data.cur), 'soc': list(tmp_data.soc),
'qt': list(tmp_data.qt), 'station':station})
delta_l = []
for x, y in zip(data['qt'][1:], data['qt'][:-1]):
delta_l.append(y - x)
data['delta_q'] = [0.000001] + delta_l
delta_v = []
for x, y in zip(data.vol[1:], data.vol[:-1]):
delta_v.append(x - y)
data['delta_v'] = [0.001] + delta_v
data['dt'] = dt
return data
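# Note (added for clarity, not part of the original logic): calculate_ica prepares the inputs for
# incremental capacity analysis (ICA). qt accumulates cur / 3600 per sample (charge in Ah if the data
# is sampled roughly once per second), and delta_q / delta_v are the per-step charge and voltage
# differences; downstream code (analysis_lithium below works on a 'delta_q_v' column) presumably
# forms the IC curve as dQ/dV ≈ delta_q / delta_v. The seed values 0.000001 and 0.001 in the first
# row simply avoid a zero denominator.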
@pandas_udf("cell_id: string, dt: string, lithium: int, station:string", PandasUDFType.GROUPED_MAP)
def analysis_lithium(ica_data):
ica_data.index = list(range(len(ica_data)))
if ica_data.vol.max() < 3.5:
flag = 0
else:
ica_max = ica_data[ica_data.vol >= 3.5]['delta_q_v'].max()
index = ica_data[ica_data.delta_q_v == ica_max].index[0]
ica_former = ica_data.loc[index - 1]['delta_q_v']
if (index + 1) < len(ica_data):
ica_latter = ica_data.loc[index + 1]['delta_q_v']
if ((ica_max - ica_former) / ica_former > 0.5) & ((ica_max - ica_latter) / ica_latter > 0.5):
flag = 1
else:
flag = 0
else:
if (ica_max - ica_former) / ica_former > 1.5:
flag = 1
else:
flag = 0
ica_data['lithium'] = flag
result = ica_data[['cell_id', 'lithium', 'dt', 'station']]
result.drop_duplicates(inplace=True)
return result
@pandas_udf("cell_id: string, dt: string, resistance: double, station:string", PandasUDFType.GROUPED_MAP)
def calculate_resistance(cell_data):
cell_data.sort_values('times', inplace=True)
cell_data.dropna(axis=0, inplace=True)
vol_list = list(cell_data.vol.unique())
if len(vol_list) < 2:
cell_resistance = pd.DataFrame(data=None, columns=['cell_id', 'resistance', 'dt', 'station'])
else:
i=0
vol_diff = vol_list[i + 1] - vol_list[i]
while vol_diff<0:
i = i + 1
vol_diff = vol_list[i + 1] - vol_list[i]
resistance = vol_diff / abs(cell_data[cell_data.vol == vol_list[i+1]]['cur'].iloc[0])
resistance = round(resistance * 1000, 3)
cell_data['resistance'] = resistance
cell_resistance = cell_data[['cell_id', 'resistance', 'dt', 'station']]
cell_resistance.drop_duplicates(inplace=True)
return cell_resistance
@pandas_udf("batu_id: string, dt: string, vol: float, extra_cap:double", PandasUDFType.GROUPED_MAP)
def get_model_f(cbd):
# shtb version. Key variables: cal, cal_table, df_1. Variables that need per-station tuning: unit_time, std_k, and the divisor used for cal_f2
unit_time = 10
cc = cbd[cbd['grade_vol'] == cbd['vol']].groupby(['cell_id']).apply(lambda x: x[x['times'] == x['times'].max()])
cc.sort_values('vol', inplace=True)
cc.index = list(range(len(cc)))
# Use the slope to pick out the few cells with the lower cut-off voltages
cc['vol_diff'] = [x-y for x, y in zip(cc.vol.iloc[1:], cc.vol.iloc[:-1])] + [1]
cc['k'] =[1] + [(x - y) / 2 for x, y in zip(cc.vol.iloc[1:], cc.vol.iloc[:-1])]
std_k = 0.0059  # this value is set manually for now
# std_k = (cc.vol.max() - cc.vol.min()) / (len(cc) / 6)
cc = cc[cc['k']!=0]
cc_1 = cc.iloc[:len(cc)//4]
cal_f = cc_1[cc_1['k'] > std_k]
cal_f2 = cc[cc['vol'] > cal_f.vol.iloc[-1]].iloc[::-len(cc) // 10]  # works for the Shanghai station; other stations need a different step
cal = pd.concat([cal_f, cal_f2])
cal.sort_values('vol', inplace=True)
cal = cal.drop_duplicates()
# Build the table used to partition the calculation
cell_id_list = cal.cell_id.to_list()
start_vol = cal.vol.to_list()
end_vol = cal.vol.to_list()[1:] + [cbd[cbd['cell_id'] == cell_id_list[-1]].vol.max()]
cal_table = pd.DataFrame(data={'cell_id': cell_id_list, 'start_vol': start_vol, 'end_vol': end_vol})
# After partitioning, collect the data points that fall inside each cell's voltage interval
df_1 = cbd[(cbd['cell_id'] == cal_table.cell_id.iloc[0])]
df_1 = df_1[(df_1['vol'] >= cal_table.start_vol.iloc[0]) & (df_1['vol'] < cal_table.end_vol.iloc[0])]
for i in range(1, len(cal_table)):
df_2 = cbd[cbd['cell_id'] == cal_table.cell_id.iloc[i]]
df_2 = df_2[(df_2['vol'] >= cal_table.start_vol.iloc[i]) & (df_2['vol'] < cal_table.end_vol.iloc[i])]
df_1 = pd.concat([df_1, df_2])
# Integrate over the voltage steps using the collected data points
vol_list = []
cap_list = []
for i in np.arange(df_1.vol.min(), df_1.vol.max() + 0.001, 0.001):
vol_list.append(round(i, 3))
tmp_data = abs(df_1[df_1.vol < i]['power'].sum()) / (3600 / unit_time)  # xmei uses unit_time = 1; hnc and shtb use 10
tmp_data = round(tmp_data, 3)
cap_list.append(tmp_data)
batu_id = cbd.batu_id.iloc[0]
dt = df_1.dt.iloc[0]
lgt = len(vol_list)
extra_cap = pd.DataFrame(data={'batu_id': [batu_id] * lgt, 'dt': [dt] * lgt, 'vol': vol_list, 'extra_cap': cap_list})
return extra_cap
@pandas_udf("batu_id: string, cell_id: string, capacity: double, state: string, soc_diff: double, min_cap: double, "
"dt: string, station: string, mean_cap: double", PandasUDFType.GROUPED_MAP)
def get_conf(data):
# Compute the distance-weighted mean capacity (cells far from the rest get smaller weights)
weight_list = []
for i in range(len(data)):
dis = 0
for j in range(len(data)):
dis += abs(data.capacity.iloc[i] - data.capacity.iloc[j])
weight_list.append(dis)
if len(weight_list) == 1 or min(weight_list) == 0:
# a single cell, or all capacities identical: fall back to equal weights to avoid dividing by zero
weight_list = [1] * len(weight_list)
weight_1 = [1 / (x / min(weight_list)) for x in weight_list]
weight = [x/sum(weight_1) for x in weight_1]
data['mean_cap'] = sum(x*y for x,y in zip(data.capacity,weight))
return data
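# Worked example (added for illustration, not part of the original logic): for capacities
# [100, 100, 90] the pairwise-distance sums are [10, 10, 20], so weight_1 = [1, 1, 0.5] and the
# normalized weights are [0.4, 0.4, 0.2]; mean_cap = 100*0.4 + 100*0.4 + 90*0.2 = 98, i.e. the
# outlying cell is down-weighted compared with the plain mean of about 96.7.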
@pandas_udf("station: string, hour: string, pid: string, val: double, times: string, dt: string", PandasUDFType.GROUPED_MAP)
def statistic_data_adjustment(data):
# print("statistic_data_adjustment start-----------------------")
if len(data)<2:
return pd.DataFrame(data={'station': [], 'hour': [], 'pid': [], 'val': [], 'times': [], 'dt': []})
data.sort_values('times', inplace=True)
data.drop_duplicates('times', inplace=True)
data = data.reset_index()
for i in range(1,len(data)//2):
if math.isnan(data.iloc[-i].val):
data['val'].iloc[-i] = data['val'].iloc[-i-1]
return data[['station', 'hour', 'pid', 'val', 'times', 'dt']]
@pandas_udf("cell_id: string, cell_title: string, vol_cons: double, temp_cons: double, cap_cons: double, res_cons: double, dt: string, station: string, numb: array<string>, v1: int, v0: string", PandasUDFType.GROUPED_MAP)
def fill_health_temp(data):
data = data.copy()  # important: copy first so we do not modify a slice of the original frame
for i in data[data['temp_cons'].isna()].v1.unique():
if len(data[data['v1'] == i]) != len(data[(data['temp_cons'].isna())]):
idx = data[(data['temp_cons'].isna()) & (data['v1'] == i)].index
mean_val = data.loc[(data['temp_cons'].notna()) & (data['v1'] == i), 'temp_cons'].mean()
data.loc[idx, 'temp_cons'] = mean_val
else:
idx = data[(data['temp_cons'].isna()) & (data['v1'] == i)].index
if i == 1:
mean_val = data.loc[data['v1'] == i + 1, 'temp_cons'].mean()
else:
mean_val = data.loc[data['v1'] == i - 1, 'temp_cons'].mean()
data.loc[idx, 'temp_cons'] = mean_val
return data
@pandas_udf("cell_id: string, cell_title: string, vol_cons: double, temp_cons: double, cap_cons: double, res_cons: double, dt: string, station: string, numb: array<string>, v1: int, v0: string, health_score: double", PandasUDFType.GROUPED_MAP)
def fill_health_score(data):
data = data.copy()  # important: copy first so we do not modify a slice of the original frame
for i in data[data['health_score'].isna()].v1.unique():
if len(data[data['v1'] == i]) != len(data[(data['health_score'].isna())]):
idx = data[(data['health_score'].isna()) & (data['v1'] == i)].index
mean_val = data.loc[(data['health_score'].notna()) & (data['v1'] == i), 'health_score'].mean()
data.loc[idx, 'health_score'] = mean_val
else:
idx = data[(data['health_score'].isna()) & (data['v1'] == i)].index
if i == 1:
mean_val = data.loc[data['v1'] == i + 1, 'health_score'].mean()
else:
mean_val = data.loc[data['v1'] == i - 1, 'health_score'].mean()
data.loc[idx, 'health_score'] = mean_val
return data
@pandas_udf("cell_id: string, cell_title: string, vol_cons: double, temp_cons: double, cap_cons: double,"
" res_cons: double, health_score: double, dt: string, station: string", PandasUDFType.GROUPED_MAP)
def avg_health_score(data):
cell_id = data.cell_id.iloc[0]
cell_title = data.cell_title.iloc[0]
station = data.station.iloc[0]
dt = data.dt.max()
vol_cons = data.vol_cons.mean()
temp_cons = data.temp_cons.mean()
cap_cons = data.cap_cons.mean()
res_cons = data.res_cons.mean()
health_score = data.health_score.mean()
res_data = pd.DataFrame(data={'cell_id': [cell_id], 'cell_title': [cell_title], 'vol_cons': [vol_cons],
'temp_cons': [temp_cons], 'cap_cons': [cap_cons], 'res_cons': [res_cons],
'health_score': [health_score], 'dt': [dt], 'station': [station]})
return res_data
@pandas_udf("batu_id: string, temp_score: double, vol_score: double, std_score: double, dt: string,"
"station: string", PandasUDFType.GROUPED_MAP)
def calculate_batu_end_cons(data):
if data.empty:
# return an empty frame carrying the full schema declared in the pandas_udf decorator
return pd.DataFrame(data=None, columns=['batu_id', 'temp_score', 'vol_score', 'std_score', 'dt', 'station'])
batu_id = data.batu_id.iloc[0]
dt = data.dt.iloc[0]
station = data.station.iloc[0]
temp_score = data[data.state == 'charge']['temp_score'].mean() * 0.3 \
+ data[data.state == 'discharge']['temp_score'].mean() * 0.7
vol_score = data[data.state == 'charge']['vol_score'].mean() * 0.3 \
+ data[data.state == 'discharge']['vol_score'].mean() * 0.7
std_score = data[data.state == 'charge']['std_score'].mean() * 0.3 \
+ data[data.state == 'discharge']['std_score'].mean() * 0.7
result = pd.DataFrame(data={'batu_id': [batu_id], 'temp_score': [temp_score], 'vol_score': [vol_score],
'std_score': [std_score], 'dt': [dt], 'station': [station]})
return result
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from battery_health.functions.state_recognition import state_data_prep, state_recognition, capacity_calculation
from battery_health.functions.health_score import health_score_data_prep, health_data_summary
from battery_health.functions.statistic_data import statistic_data_prep, statistic_calculation
from battery_health.functions.derive import data_summary
from battery_health import configure
from battery_health import main_process
from battery_health import data_repair
import time
import argparse
import random
from datetime import datetime, timedelta
from battery_health import spark
"""
这里划分方法的依据是:流程上的模块
"""
def execute_mission(mode, date):
# Stage 1: obtain charge/discharge states
# Base data preparation
if mode == 'dev_data':
data = state_data_prep.data_prep(date)
data.write.mode('overwrite').insertInto(configure.dev_table, overwrite=True)
# Device state recognition
elif mode == 'dev_state':
state = state_recognition.calculate_state(date)
state.write.mode('overwrite').insertInto(configure.state_table, overwrite=True)
# Device capacity calculation
elif mode == 'dev_capacity':
capacity = capacity_calculation.calculate_dev_capacity(date)
capacity.write.mode('overwrite').insertInto(configure.cap_table, overwrite=True)
# Stage 2, part 1: health score calculation
# Preprocessing: cell_base module
elif mode == 'cell_base':
health_score_data_prep.get_cell_base(date)
# Writes must be done per station; writing too much data at once triggers GC problems
# Health score (consistency)
elif mode == 'cell_health':
score = health_data_summary.health_score(date)
score.write.mode('overwrite').insertInto(configure.cell_health_1, overwrite=True)
# Micro short circuit
elif mode == 'cell_circuit':
circuit = health_data_summary.cal_micro_circuit(date)
circuit.write.mode('overwrite').insertInto(configure.cell_circuit_table, overwrite=True)
# Lithium plating
elif mode == 'cell_lithium':
lithium = health_data_summary.cal_lithium(date)
lithium.write.mode('overwrite').insertInto(configure.cell_lithium_table, overwrite=True)
# Internal resistance
elif mode == 'cell_resistance':
resistance = health_data_summary.cal_resistance(date)
resistance.write.mode('overwrite').insertInto(configure.cell_resistance_table, overwrite=True)
# Cell capacity
elif mode == 'cell_capacity':
cell_capacity = health_data_summary.cal_cell_capacity(date)
cell_capacity.write.mode('overwrite').insertInto(configure.cell_capacity_table, overwrite=True)
# Cell capacity confidence score, unused for now
# cell_capacity_conf = data_summary.cal_cell_capacity_conf(date, cycles=3)
# cell_capacity_conf.write.mode('overwrite').insertInto(configure.cell_capacity_conf_table, overwrite=True)
# Battery O&M capacity data
elif mode == 'capacity_estimate':
cell_capacity = health_data_summary.capacity_estimate(date)
# cc = cell_capacity.toPandas()
# cell_total = 0
# for i in configure.station_list:
# cell_total += configure.general_map[i]['details']['cell_total']
# if len(cc) == cell_total:
# cell_capacity.write.format('hive').insertInto(configure.battery_operation_table, overwrite=True)
cell_capacity.write.format('hive').insertInto(configure.battery_operation_table, overwrite=True)
# Cell health score
elif mode == 'cell_health_score':
cell_health_score = health_data_summary.calculate_cell_health_score(date, cycles=configure.cell_health_range)
print("start insertInto cell_health_table")
cell_health_score.write.mode('append').insertInto(configure.cell_health_table, overwrite=False)  # cell health score
# Battery unit daily health score
elif mode == 'batu_health_score':
batu_daily_score = health_data_summary.calculate_batu_daily_health_score(date)
# batu_daily_score.write.mode('overwrite').insertInto(configure.daily_batu_health_table, overwrite=True)
# The battery-unit health score below needs 7 days of daily scores, so this is appended rather than overwritten
batu_daily_score.write.mode('append').insertInto(configure.daily_batu_health_table, overwrite=False)
print("电池单元每日健康分运行完毕")
# Battery unit health score
# Query the daily battery-unit health scores of the 7 days up to and including the input date, then compute the battery-unit health score
batu_score = health_data_summary.calculate_batu_health_score(date)
batu_score.write.mode('overwrite').insertInto(configure.batu_health_table, overwrite=True)
print("电池单元健康分运行完毕")
# Health score summary, written to the result table "test.health_score"
elif mode == 'health_score':
# Query the cell health scores (cell_health_table) and battery-unit health scores (batu_health_table) and compute the summary
health_data = health_data_summary.score_summary(date)
health_data.write.format('hive').insertInto(configure.health_score, overwrite=True)
# Stage 2, part 2: statistics calculation (detail page)
# Preprocessing
elif mode == 'statistic_mid':
data = statistic_data_prep.data_prep_station(date)
# Mock data generation (commented out): rows whose fields match the dwb_statistic_interm_dd table schema
# rows = []
# base_date = datetime.strptime(date, '%Y-%m-%d')
# for i in range(10000):
# batu_id = f"batu_{random.randint(1, 10)}"
# cell_id = f"cell_{random.randint(1, 100)}"
# soc = round(random.uniform(0, 100), 2)
# vol = round(random.uniform(2.5, 4.2), 3)
# state = random.choice(["充电", "放电", "静置"])
# # random timestamp
# random_minutes = random.randint(0, 1439)
# times = (base_date + timedelta(minutes=random_minutes)).strftime('%Y-%m-%d %H:%M:%S')
# dt = date
# station = f"station_{random.randint(1, 5)}"
# rows.append((batu_id, cell_id, soc, vol, state, times, dt, station))
# columns = ["batu_id", "cell_id", "soc", "vol", "state", "times", "dt", "station"]
# data = spark.createDataFrame(rows, columns)
print("开始写入prod.dwb_statistic_interm_dd表-------")
data.write.format('hive').insertInto(configure.statistic_mid_table, overwrite=True)
# data.write.mode('append').insertInto(configure.statistic_mid_table, overwrite=False)
print("结束-------")
# Compute the detail-page data, written to the result table "test.health_data"
elif mode == 'health_data':
data = statistic_calculation.get_statistic(date=date, cycles=1)
data.write.format('hive').insertInto(configure.health_data, overwrite=True)
# Stage 2, part 3: SOH prediction
# SOH-related data, written to the result table "test.decay_data"
elif mode == 'decay_data':
# This should also be changed to append, because the battery-unit daily health score needs the previous 7 days of SOH data
data = data_summary.calculate_derive_index(date=date)
data.write.format('hive').insertInto(configure.soh_table, overwrite=True)
# Stage 3: data summary
elif mode == 'summary':
hive_data = main_process.sava_to_hive(date)
hive_data.write.format('hive').insertInto(configure.influxdb_table, overwrite=True)
# Data repair
elif mode == 'data_repair':
data_repair.statistic_repair(date, station='hnc')
HELP_STR = "nothing"
CHOICES = ['dev_data', # data written successfully
'dev_state', # data written successfully
'dev_capacity', # data written successfully
'cell_base', # data written successfully; only data for the hnc station is written, and the same applies to the modules below
'cell_health', # data written successfully
'cell_circuit', # data written successfully
'cell_lithium', # data written successfully
'cell_resistance', # data written successfully
'cell_capacity', # data written successfully
'capacity_estimate', # data written successfully; row count matches that of cell_capacity
'cell_health_score', # data written successfully
'batu_health_score', # data written successfully; soh and health_score come out null here, the cause has been recorded
'health_score',
'statistic_mid', # data written successfully
'health_data', # data written successfully
'decay_data', # data written successfully
'summary', # data written successfully
'data_repair']
if __name__ == '__main__':
# Command-line argument parsing (commented out; the mode and date are currently hard-coded below)
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-m', '--mode', help=HELP_STR, choices=CHOICES, default='sava_data')
# arg_parser.add_argument('-d', '--date', help='the date of mission start, for example: 2020-01-01')
# args = arg_parser.parse_args()
# print('*' * 20 + 'Mission start: {mission}, date: {date}'.format(mission=args.mode, date=args.date) + '*' * 20)
# st = time.time()
# execute_mission(args.mode, args.date)
# et = time.time()
# print('*' * 20 + 'The mission {mission} cost {cost} '.format(mission=args.mode, cost=et - st) + '*' * 20)
execute_mission('decay_data', date='2024-10-15')
## Battery Health Project Code Description <br>
## How to Run the Program
1. Run with spark-submit:
spark-submit <br>
--master yarn <br>
--deploy-mode [cluster/local] <br>
--driver-memory [2g] <br>
--executor-memory [2g] <br>
--num-executors [executor-number] <br>
--conf spark.pyspark.python=[python-path] <br>
--conf spark.pyspark.driver.python=[python-path] <br>
--py-files [project-path] hdfs:[main.py] <br>
-m [mode] <br>
-d [date] <br>
2. Run directly with python:
python3 [main.py] -m [mode] -d [date] <br>
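For example (illustrative only: the memory sizes, executor count, Python paths and HDFS path are placeholders, and the argparse block in mission.py must be enabled for the -m / -d flags to take effect): <br>
spark-submit --master yarn --deploy-mode cluster --driver-memory 2g --executor-memory 2g --num-executors 4 --conf spark.pyspark.python=/usr/bin/python3 --conf spark.pyspark.driver.python=/usr/bin/python3 --py-files battery_health.zip hdfs:///path/to/mission.py -m dev_data -d 2024-10-15 <br>
python3 mission.py -m dev_data -d 2024-10-15 <br>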
## Program Functionality
The project directory tree is as follows: <br>
huafon_battery_health/ <br>
├── battery_health <br>
│   ├── configure.py <br>
│   ├── data_load.py <br>
│   ├── functions <br>
│   │   ├── data_verification <br>
│   │   │   ├── adjustment.py <br>
│   │   │   ├── data_check.py <br>
│   │   │   └── __init__.py <br>
│   │   ├── derive <br>
│   │   │   ├── data_summary.py <br>
│   │   │   ├── derive.py <br>
│   │   │   └── __init__.py <br>
│   │   ├── health_score <br>
│   │   │   ├── data_prep.py <br>
│   │   │   ├── data_summary.py <br>
│   │   │   ├── health_data_summary.py <br>
│   │   │   ├── health_score_data_prep.py <br>
│   │   │   ├── __init__.py <br>
│   │   │   └── score_data_prep.py <br>
│   │   ├── __init__.py <br>
│   │   ├── state_recognition <br>
│   │   │   ├── capacity_calculation.py <br>
│   │   │   ├── data_prep.py <br>
│   │   │   ├── __init__.py <br>
│   │   │   ├── state_data_prep.py <br>
│   │   │   └── state_recognition.py <br>
│   │   └── statistic_data <br>
│   │   ├── data_prep.py <br>
│   │   ├── data_summary.py <br>
│   │   ├── __init__.py <br>
│   │   ├── statistic_calculation.py <br>
│   │   ├── statistic_data_prep.py <br>
│   │   ├── statistic_data.py <br>
│   │   └── statistic_data_summary.py <br>
│   ├── __init__.py <br>
│   ├── main_process.py <br>
│   └── udf_collections.py <br>
└── mission.py <br>
The root directory contains the battery_health package and the file mission.py; mission.py is the main entry point of the program and controls which task gets executed.
The battery_health directory contains the functions package plus configure.py, data_load.py, udf_collections.py, __init__.py and so on.
configure.py records station-specific information (installed capacity, number of cells, commissioning date, etc.) as well as shared information such as the intermediate
result tables; data_load.py holds the data-loading functions; udf_collections.py is the collection of user-defined functions; __init__.py is the script that configures
Spark and creates the Spark session (please refer to the Spark documentation for the relevant background).
<br>
<br>
The functions directory holds the implementation of the battery health system's features, split into 5 modules: data verification (data_verification), charge/discharge state recognition (state_recognition),
detail statistics (statistic_data), health score (health_score), and derived indicators (derive). Each module consists of a data-preparation part and a calculation part, and the concrete implementation
is split into several scripts as needed.<br>
The data verification module contains the data sanity-check script (data_check.py) and the data adjustment module (adjustment.py); the sanity checks cover SoC, voltage, temperature and similar data,
while the adjustment module processes the measurement data.<br>
The charge/discharge state recognition module identifies the charging/discharging processes of the storage devices, including the start/end times of each process and the energy and SoC changes during it. state_data_prep.py turns the voltage, current, power and SoC data of the different device levels (battery unit, battery cluster, etc.) into DataFrames; state_recognition.py analyses the device data (power and SoC) to obtain the start/end times, durations, SoC changes and energy changes of each charge/discharge process; capacity_calculation.py uses the recognition results to compute the capacity of battery units / battery clusters, a step that still carries a systematic error (to be optimised later). <br>
The detail statistics module focuses on cell consistency at the end of charge and discharge. statistic_data_prep.py processes every battery unit and all of its cells, merges the data into DataFrames and expands it in time to obtain uniformly sampled data; statistic_calculation.py uses the charge/discharge recognition results to filter out the end-of-process data and computes the related indicators on it; statistic_data_summary.py post-processes the computed indicators and stores them in the corresponding database. <br>
The health score module quantifies the health state of battery units and cells; the score is based on cell consistency, in particular voltage, temperature and capacity consistency. health_data_summary.py scores battery units / cells on each consistency dimension and then aggregates the scores; when quantifying single cells it also analyses lithium plating and internal short circuits, and finally the health scores are processed and stored in the corresponding database. <br>
The derived-indicator module aggregates the results of the preceding modules into the final outputs. <br>
For the detailed design of each module, see the "Battery Health Detailed Design Document": https://z9ouaq0lze.feishu.cn/wiki/OlQ8wVtr9iIwqNkRSTHcXGsfnnb?from=from_copylink <br>
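A full day can be processed by invoking the modes listed in mission.py one after another (illustrative sketch; the date is an example and the exact dependencies between modes follow the stage comments in mission.py): <br>
python3 mission.py -m dev_data -d 2024-10-15 <br>
python3 mission.py -m dev_state -d 2024-10-15 <br>
python3 mission.py -m dev_capacity -d 2024-10-15 <br>
... (cell_base, cell_health, cell_circuit, cell_lithium, cell_resistance, cell_capacity, capacity_estimate, cell_health_score, batu_health_score, health_score, statistic_mid, health_data, decay_data) <br>
python3 mission.py -m summary -d 2024-10-15 <br>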
# pip install pyspark
# pip install setuptools
pip install pyspark -i https://mirrors.aliyun.com/pypi/simple/
pip install findspark -i https://mirrors.aliyun.com/pypi/simple/
pip install pandas==1.1.4 -i https://mirrors.aliyun.com/pypi/simple/
pip install pyarrow==0.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install numpy==1.19.2 -i https://mirrors.aliyun.com/pypi/simple/
pip install py4j==0.10.9.1 -i https://mirrors.aliyun.com/pypi/simple/
pip install Cython==0.29.21 -i https://mirrors.aliyun.com/pypi/simple/
pip install Pillow==8.4.0 -i https://mirrors.aliyun.com/pypi/simple/
pip install scikit-learn==0.24.2 -i https://mirrors.aliyun.com/pypi/simple/
pip install scipy==1.5.4 -i https://mirrors.aliyun.com/pypi/simple/
pip install requests -i https://mirrors.aliyun.com/pypi/simple/