1.1.3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the credit dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='credit_data.csv')
# Bare expression: as the last statement of a notebook cell it renders the table
data
| | CustomerID | Name | Age | Income | LoanAmount | LoanTerm | CreditScore | Default | TransactionHistory |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Customer_1 | 62.0 | 9021.0 | 12200 | 13 | 808 | 0 | [{"amount": 795, "date": "2023-09-06"}, {"amou... |
1 | 2 | Customer_2 | 65.0 | 18187.0 | 29983 | 46 | 378 | 0 | [{"amount": 865, "date": "2023-02-10"}, {"amou... |
2 | 3 | Customer_3 | 18.0 | 14912.0 | 27265 | 35 | 819 | 0 | [{"amount": 398, "date": "2023-09-10"}, {"amou... |
3 | 4 | Customer_4 | 21.0 | 6517.0 | 9870 | 11 | 823 | 0 | [{"amount": 217, "date": "2023-04-26"}, {"amou... |
4 | 5 | Customer_5 | NaN | 10629.0 | 34761 | 44 | 614 | 0 | [{"amount": 520, "date": "2023-03-08"}, {"amou... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 996 | Customer_996 | 44.0 | 10647.0 | 48751 | 50 | 567 | 0 | [{"amount": 719, "date": "2023-07-11"}, {"amou... |
996 | 997 | Customer_997 | 52.0 | 18029.0 | 29745 | 42 | 597 | 0 | [{"amount": 534, "date": "2023-07-20"}, {"amou... |
997 | 998 | Customer_998 | 35.0 | 11842.0 | 26814 | 24 | 449 | 0 | [{"amount": 807, "date": "2023-04-14"}, {"amou... |
998 | 999 | Customer_999 | 53.0 | 19134.0 | 25761 | 28 | 391 | 0 | [{"amount": 370, "date": "2023-04-13"}, {"amou... |
999 | 1000 | Customer_1000 | 25.0 | 2819.0 | 48398 | 40 | 381 | 0 | [{"amount": 135, "date": "2023-05-30"}, {"amou... |
1000 rows × 9 columns
# 1. Data completeness audit
# Missing-value statistics: number of NaN entries in each column.
missing_values = data.isna().sum()
# Duplicate-value statistics: number of rows that repeat an earlier row.
# `duplicated()` on its own yields one boolean per row; summing that mask
# produces the single count this report is meant to print.
duplicate_values = data.duplicated().sum()
# Print the results
print("缺失值统计:")
print(missing_values)
print("重复值统计:")
print(duplicate_values)
缺失值统计:
CustomerID 0
Name 0
Age 1
Income 1
LoanAmount 0
LoanTerm 0
CreditScore 0
Default 0
TransactionHistory 0
dtype: int64
重复值统计:
0
# 2. Data plausibility audit
# One boolean flag column per rule; both range bounds are inclusive.
# Age must lie in [18, 70].
data['is_age_valid'] = (data['Age'] >= 18) & (data['Age'] <= 70)
# Income must be strictly greater than 2000.
data['is_income_valid'] = data['Income'].gt(2000)
# The loan amount must stay strictly below five times the income.
data['is_loan_amount_valid'] = data['LoanAmount'].lt(data['Income'].mul(5))
# Credit score must lie in [300, 850].
data['is_credit_score_valid'] = (data['CreditScore'] >= 300) & (data['CreditScore'] <= 850)
# A row is valid only when every individual rule holds.
validity_flag_columns = [
    'is_age_valid',
    'is_income_valid',
    'is_loan_amount_valid',
    'is_credit_score_valid',
]
validity_checks = data[validity_flag_columns].all(axis=1)
data['is_valid'] = validity_checks
# Print a summary (count / unique / top / freq) of every flag column
print("数据合理性检查:")
validity_summary = data[validity_flag_columns + ['is_valid']].describe()
print(validity_summary)
数据合理性检查:
is_age_valid is_income_valid is_loan_amount_valid
count 1000 1000 1000
unique 2 2 2
top True True True
freq 999 999 796
is_credit_score_valid is_valid
count 1000 1000
unique 1 2
top True True
freq 1000 795
# 3. Data cleaning and outlier handling
# Rows that failed at least one plausibility rule, kept aside for inspection.
invalid_rows = data.loc[~data['is_valid']]
# Keep only the valid rows and strip the helper flag columns in one pass.
cleaned_data = data.loc[data['is_valid']].drop(
    [
        'is_age_valid',
        'is_income_valid',
        'is_loan_amount_valid',
        'is_credit_score_valid',
        'is_valid',
    ],
    axis=1,
)
# Persist the cleaned dataset without writing the index column
cleaned_data.to_csv('cleaned_credit_data.csv', index=False)
print("数据清洗完成,已保存为 'cleaned_credit_data.csv'")
数据清洗完成,已保存为 'cleaned_credit_data.csv'