chore(project): 初始化项目结构和配置

- 添加 .idea 目录和相关配置文件,设置项目忽略文件、编码、模块管理等
- 创建商务大数据分析目录和子目录,准备数据和任务笔记本
- 添加示例数据文件:中国城市人口数据.csv
- 创建任务笔记本文件,包含数据处理和分析示例
This commit is contained in:
dev_xulongjin 2025-04-14 16:06:13 +08:00
commit 655911b748
15 changed files with 29981 additions and 0 deletions

8
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

6
.idea/encodings.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/商务大数据分析/20250402/data/中国城市人口数据.csv" charset="GBK" />
</component>
</project>

8
.idea/gcc-project-py-25-2.iml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="$MODULE_DIR$/../../Environment/anaconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

6
.idea/misc.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="$PROJECT_DIR$/../../Environment/anaconda3" />
</component>
</project>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/gcc-project-py-25-2.iml" filepath="$PROJECT_DIR$/.idea/gcc-project-py-25-2.iml" />
</modules>
</component>
</project>

4
.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings" defaultProject="true" />
</project>

View File

@ -0,0 +1,23 @@
省份,2020年人口万人,2019年人口万人
河北省,7461,7447
山西省,3492,3497
辽宁省,4259,4277
吉林省,2407,2448
江苏省,8475,8469
浙江省,6457,6375
安徽省,6103,6092
福建省,4154,4137
江西省,4519,4516
山东省,10153,10106
河南省,9937,9901
湖北省,5775,5927
湖南省,6644,6640
广东省,12601,12489
海南省,1008,995
四川省,8367,8351
贵州省,3856,3848
云南省,4721,4714
陕西省,3953,3944
甘肃省,2502,2509
青海省,592,590
黑龙江省,3185,3255
1 省份 2020年人口(万人) 2019年人口(万人)
2 河北省 7461 7447
3 山西省 3492 3497
4 辽宁省 4259 4277
5 吉林省 2407 2448
6 江苏省 8475 8469
7 浙江省 6457 6375
8 安徽省 6103 6092
9 福建省 4154 4137
10 江西省 4519 4516
11 山东省 10153 10106
12 河南省 9937 9901
13 湖北省 5775 5927
14 湖南省 6644 6640
15 广东省 12601 12489
16 海南省 1008 995
17 四川省 8367 8351
18 贵州省 3856 3848
19 云南省 4721 4714
20 陕西省 3953 3944
21 甘肃省 2502 2509
22 青海省 592 590
23 黑龙江省 3185 3255

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,699 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-02T07:51:13.983021Z",
"start_time": "2025-04-02T07:51:13.980852Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 113
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.035104Z",
"start_time": "2025-04-02T07:51:14.008139Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/某地区房屋销售数据 (1).csv', encoding='gbk')\n",
"data.head(5)"
],
"id": "6f3a167b4381943a",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数\n",
"0 2010/1/4 0:00 2615 435000 house 3\n",
"1 2010/1/5 0:00 2904 712000 house 4\n",
"2 2010/1/6 0:00 2617 435000 house 4\n",
"3 2010/1/6 0:00 2606 1350000 house 5\n",
"4 2010/1/7 0:00 2905 612500 house 4"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 114
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.079308Z",
"start_time": "2025-04-02T07:51:14.069694Z"
}
},
"cell_type": "code",
"source": [
"data['new_postcode'] = data['地区邮编'].apply(lambda x: str(x)[:2])\n",
"data.head(5)"
],
"id": "817b591e756eaf93",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 2010/1/7 0:00 2905 612500 house 4 29"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 115
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.136665Z",
"start_time": "2025-04-02T07:51:14.129644Z"
}
},
"cell_type": "code",
"source": "data.groupby('new_postcode').agg({'房屋出售时间':'count'})",
"id": "4f648cd98de38213",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间\n",
"new_postcode \n",
"26 16393\n",
"29 10975"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>16393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>10975</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 116
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.229857Z",
"start_time": "2025-04-02T07:51:14.216154Z"
}
},
"cell_type": "code",
"source": [
"housesale1 = data.groupby(['房屋类型', 'new_postcode']).apply(lambda x:x).reset_index()\n",
"housesale1"
],
"id": "31e96124eb1769ea",
"outputs": [
{
"data": {
"text/plain": [
" index 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 4 2010/1/7 0:00 2905 612500 house 4 29\n",
"... ... ... ... ... ... ... ...\n",
"27363 27363 2019/7/25 0:00 2900 500000 unit 3 29\n",
"27364 27364 2019/7/25 0:00 2612 560000 unit 2 26\n",
"27365 27365 2019/7/26 0:00 2912 464950 unit 2 29\n",
"27366 27366 2019/7/26 0:00 2601 589000 unit 2 26\n",
"27367 27367 2019/7/26 0:00 2612 775000 unit 2 26\n",
"\n",
"[27368 rows x 7 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27363</th>\n",
" <td>27363</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2900</td>\n",
" <td>500000</td>\n",
" <td>unit</td>\n",
" <td>3</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27364</th>\n",
" <td>27364</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2612</td>\n",
" <td>560000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27365</th>\n",
" <td>27365</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2912</td>\n",
" <td>464950</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27366</th>\n",
" <td>27366</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2601</td>\n",
" <td>589000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27367</th>\n",
" <td>27367</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2612</td>\n",
" <td>775000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27368 rows × 7 columns</p>\n",
"</div>"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 117
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.304214Z",
"start_time": "2025-04-02T07:51:14.298702Z"
}
},
"cell_type": "code",
"source": "data['平均价格'] = data.groupby(['房屋类型', 'new_postcode'])['房屋价格'].transform('mean')",
"id": "5249fcce9b76b48f",
"outputs": [],
"execution_count": 118
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:09.492950Z",
"start_time": "2025-04-02T07:52:09.480225Z"
}
},
"cell_type": "code",
"source": [
"data2 = data.drop_duplicates(['房屋类型','new_postcode'],inplace=False)\n",
"data2"
],
"id": "93afa495c804a0f6",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode 平均价格\n",
"0 2010/1/4 0:00 2615 435000 house 3 26 725040.113978\n",
"1 2010/1/5 0:00 2904 712000 house 4 29 582085.199671\n",
"22595 2010/1/11 0:00 2602 270000 unit 1 26 434573.470446\n",
"22607 2010/2/9 0:00 2900 436000 unit 2 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>2010/1/11 0:00</td>\n",
" <td>2602</td>\n",
" <td>270000</td>\n",
" <td>unit</td>\n",
" <td>1</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>2010/2/9 0:00</td>\n",
" <td>2900</td>\n",
" <td>436000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 125
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:17.149242Z",
"start_time": "2025-04-02T07:52:17.142432Z"
}
},
"cell_type": "code",
"source": "data2[['房屋类型','new_postcode','平均价格']]",
"id": "9dd96081baad6b3d",
"outputs": [
{
"data": {
"text/plain": [
" 房屋类型 new_postcode 平均价格\n",
"0 house 26 725040.113978\n",
"1 house 29 582085.199671\n",
"22595 unit 26 434573.470446\n",
"22607 unit 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋类型</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>house</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>house</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>unit</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>unit</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 126
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,478 @@
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:57:56.382179Z",
"start_time": "2025-04-02T07:57:55.984261Z"
}
},
"cell_type": "code",
"source": "import pandas as pd",
"id": "3244cf38b10be81b",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:00:15.267189Z",
"start_time": "2025-04-02T08:00:15.229542Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/某地区房屋销售数据 (1).csv', encoding='gbk')\n",
"data['new_postcode'] = data['地区邮编'].apply(lambda x: str(x)[:2])\n",
"data.head(3)"
],
"id": "d973cf9fe6ac90a6",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2010/1/6 0:00 2617 435000 house 4 26"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:00:25.320359Z",
"start_time": "2025-04-02T08:00:25.301349Z"
}
},
"cell_type": "code",
"source": [
"# 1、求出不同地区和不同房间数的平均房价(使用pivot_table函数)\n",
"data.pivot_table(values='房屋价格', index='new_postcode', columns='配套房间数', aggfunc='mean')"
],
"id": "c9d4b29b2fbd4334",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 \\\n",
"new_postcode \n",
"26 564125.0 343189.962401 457595.588277 624204.46900 \n",
"29 528000.0 292934.514286 381675.627240 475210.25609 \n",
"\n",
"配套房间数 4 5 \n",
"new_postcode \n",
"26 810389.319007 1.037034e+06 \n",
"29 651102.874716 7.995584e+05 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>564125.0</td>\n",
" <td>343189.962401</td>\n",
" <td>457595.588277</td>\n",
" <td>624204.46900</td>\n",
" <td>810389.319007</td>\n",
" <td>1.037034e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>528000.0</td>\n",
" <td>292934.514286</td>\n",
" <td>381675.627240</td>\n",
" <td>475210.25609</td>\n",
" <td>651102.874716</td>\n",
" <td>7.995584e+05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:04:02.430064Z",
"start_time": "2025-04-02T08:04:02.415284Z"
}
},
"cell_type": "code",
"source": [
"# 2、不同地区哪种类型的房产房价最贵(使用pivot_table函数)\n",
"data.pivot_table(values='房屋价格', index='new_postcode', columns='房屋类型', aggfunc='max')"
],
"id": "a5e4f3321d168313",
"outputs": [
{
"data": {
"text/plain": [
"房屋类型 house unit\n",
"new_postcode \n",
"26 8000000 2500000\n",
"29 5425000 769500"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>房屋类型</th>\n",
" <th>house</th>\n",
" <th>unit</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>8000000</td>\n",
" <td>2500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>5425000</td>\n",
" <td>769500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:04:41.035870Z",
"start_time": "2025-04-02T08:04:41.012959Z"
}
},
"cell_type": "code",
"source": [
"# 3、不同类型房产和不同房间数的房价之间的比较(使用pivot_table函数)\n",
"data.pivot_table(values='房屋价格', index='房屋类型', columns='配套房间数', aggfunc='mean')"
],
"id": "4ed9b36daea1c503",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 \\\n",
"房屋类型 \n",
"house 677394.736842 353634.269663 489555.889339 560117.683516 \n",
"unit 330850.000000 336570.325391 432502.153116 594535.982287 \n",
"\n",
"配套房间数 4 5 \n",
"房屋类型 \n",
"house 730667.024375 9.290297e+05 \n",
"unit 641736.842105 1.146333e+06 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>房屋类型</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>house</th>\n",
" <td>677394.736842</td>\n",
" <td>353634.269663</td>\n",
" <td>489555.889339</td>\n",
" <td>560117.683516</td>\n",
" <td>730667.024375</td>\n",
" <td>9.290297e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unit</th>\n",
" <td>330850.000000</td>\n",
" <td>336570.325391</td>\n",
" <td>432502.153116</td>\n",
" <td>594535.982287</td>\n",
" <td>641736.842105</td>\n",
" <td>1.146333e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:05:23.703349Z",
"start_time": "2025-04-02T08:05:23.691916Z"
}
},
"cell_type": "code",
"source": [
"# 4、不同地区不同房间数房屋销售情况交叉表(使用crosstab函数,参考例3-61)\n",
"pd.crosstab(data['new_postcode'], data['配套房间数'])"
],
"id": "799d99489d93b2b5",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 4 5\n",
"new_postcode \n",
"26 24 1383 2815 6371 4793 1007\n",
"29 5 175 558 4557 4845 835"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>24</td>\n",
" <td>1383</td>\n",
" <td>2815</td>\n",
" <td>6371</td>\n",
" <td>4793</td>\n",
" <td>1007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>5</td>\n",
" <td>175</td>\n",
" <td>558</td>\n",
" <td>4557</td>\n",
" <td>4845</td>\n",
" <td>835</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 10
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-08T10:57:29.263616Z",
"start_time": "2025-04-08T10:57:28.865194Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T10:59:11.991479Z",
"start_time": "2025-04-08T10:59:11.985778Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/中国城市人口数据.csv',encoding=\"GBK\")\n",
"data.head(5)"
],
"id": "c3fd933261d1f7fb",
"outputs": [
{
"data": {
"text/plain": [
" 省份 2020年人口万人 2019年人口万人\n",
"0 河北省 7461 7447\n",
"1 山西省 3492 3497\n",
"2 辽宁省 4259 4277\n",
"3 吉林省 2407 2448\n",
"4 江苏省 8475 8469"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>省份</th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>河北省</td>\n",
" <td>7461</td>\n",
" <td>7447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>山西省</td>\n",
" <td>3492</td>\n",
" <td>3497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>辽宁省</td>\n",
" <td>4259</td>\n",
" <td>4277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>吉林省</td>\n",
" <td>2407</td>\n",
" <td>2448</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>江苏省</td>\n",
" <td>8475</td>\n",
" <td>8469</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:42.958649Z",
"start_time": "2025-04-08T11:10:42.956278Z"
}
},
"cell_type": "code",
"source": "data.shape",
"id": "a0d05b5dea7e5cfc",
"outputs": [
{
"data": {
"text/plain": [
"(22, 3)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:37.015499Z",
"start_time": "2025-04-08T11:10:37.005663Z"
}
},
"cell_type": "code",
"source": "data.info()",
"id": "b602f50b182485dd",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 22 entries, 0 to 21\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 省份 22 non-null object\n",
" 1 2020年人口万人 22 non-null int64 \n",
" 2 2019年人口万人 22 non-null int64 \n",
"dtypes: int64(2), object(1)\n",
"memory usage: 656.0+ bytes\n"
]
}
],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:01:22.100819Z",
"start_time": "2025-04-08T11:01:22.080321Z"
}
},
"cell_type": "code",
"source": "data.describe()",
"id": "1218c2b44c21d012",
"outputs": [
{
"data": {
"text/plain": [
" 2020年人口万人 2019年人口万人\n",
"count 22.000000 22.000000\n",
"mean 5482.772727 5478.500000\n",
"std 3067.216187 3043.789239\n",
"min 592.000000 590.000000\n",
"25% 3583.000000 3584.750000\n",
"50% 4620.000000 4615.000000\n",
"75% 7256.750000 7245.250000\n",
"max 12601.000000 12489.000000"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>22.000000</td>\n",
" <td>22.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>5482.772727</td>\n",
" <td>5478.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3067.216187</td>\n",
" <td>3043.789239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>592.000000</td>\n",
" <td>590.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3583.000000</td>\n",
" <td>3584.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>4620.000000</td>\n",
" <td>4615.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>7256.750000</td>\n",
" <td>7245.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>12601.000000</td>\n",
" <td>12489.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:29.629195Z",
"start_time": "2025-04-08T11:10:29.625545Z"
}
},
"cell_type": "code",
"source": [
"# 2020年总人口\n",
"data['2020年人口万人'].sum()"
],
"id": "93faae0d69a5d4e2",
"outputs": [
{
"data": {
"text/plain": [
"120621"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:11:33.375487Z",
"start_time": "2025-04-08T11:11:33.361781Z"
}
},
"cell_type": "code",
"source": [
"# 2019年总人口\n",
"data['2019年人口万人'].sum()"
],
"id": "81d587e3605ba734",
"outputs": [
{
"data": {
"text/plain": [
"120527"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 16
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:13:18.823381Z",
"start_time": "2025-04-08T11:13:18.809605Z"
}
},
"cell_type": "code",
"source": [
"# 总人口对比(2020年减2019年)\n",
"data['2020年人口万人'].sum() - data['2019年人口万人'].sum()"
],
"id": "2e5f8e1821c05cdf",
"outputs": [
{
"data": {
"text/plain": [
"94"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 20
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:13:20.739094Z",
"start_time": "2025-04-08T11:13:20.731449Z"
}
},
"cell_type": "code",
"source": [
"# 各省人口对比(2020年减2019年;注意 data2 与 data 指向同一对象,compare 列也会加到 data 上)\n",
"data2 = data\n",
"data2['compare'] = data2['2020年人口万人'] - data2['2019年人口万人']\n",
"data2.head(5)"
],
"id": "e7bda6c2c79305bb",
"outputs": [
{
"data": {
"text/plain": [
" 省份 2020年人口万人 2019年人口万人 compare\n",
"0 河北省 7461 7447 14\n",
"1 山西省 3492 3497 -5\n",
"2 辽宁省 4259 4277 -18\n",
"3 吉林省 2407 2448 -41\n",
"4 江苏省 8475 8469 6"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>省份</th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" <th>compare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>河北省</td>\n",
" <td>7461</td>\n",
" <td>7447</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>山西省</td>\n",
" <td>3492</td>\n",
" <td>3497</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>辽宁省</td>\n",
" <td>4259</td>\n",
" <td>4277</td>\n",
" <td>-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>吉林省</td>\n",
" <td>2407</td>\n",
" <td>2448</td>\n",
" <td>-41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>江苏省</td>\n",
" <td>8475</td>\n",
" <td>8469</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 21
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,902 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-14T02:39:40.769558Z",
"start_time": "2025-04-14T02:39:40.456570Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T02:41:58.436846Z",
"start_time": "2025-04-14T02:41:58.386566Z"
}
},
"cell_type": "code",
"source": [
"data1 = pd.read_excel('data/healthcare-dataset-stroke.xlsx')\n",
"data1.head(3)"
],
"id": "4b3c42b38f05d480",
"outputs": [
{
"data": {
"text/plain": [
" 编号 性别 高血压 是否结婚 工作类型 居住类型 体重指数 吸烟史 中风\n",
"0 9046 男 否 是 私人 城市 36.6 以前吸烟 是\n",
"1 51676 女 否 是 私营企业 农村 NaN 从不吸烟 是\n",
"2 31112 男 否 是 私人 农村 32.5 从不吸烟 是"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>编号</th>\n",
" <th>性别</th>\n",
" <th>高血压</th>\n",
" <th>是否结婚</th>\n",
" <th>工作类型</th>\n",
" <th>居住类型</th>\n",
" <th>体重指数</th>\n",
" <th>吸烟史</th>\n",
" <th>中风</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私人</td>\n",
" <td>城市</td>\n",
" <td>36.6</td>\n",
" <td>以前吸烟</td>\n",
" <td>是</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私营企业</td>\n",
" <td>农村</td>\n",
" <td>NaN</td>\n",
" <td>从不吸烟</td>\n",
" <td>是</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私人</td>\n",
" <td>农村</td>\n",
" <td>32.5</td>\n",
" <td>从不吸烟</td>\n",
" <td>是</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T02:42:02.131783Z",
"start_time": "2025-04-14T02:42:02.114377Z"
}
},
"cell_type": "code",
"source": [
"data2 = pd.read_excel('data/healthcare-dataset-age_abs.xlsx')\n",
"data2.head(3)"
],
"id": "e72f2e11a9b2e88d",
"outputs": [
{
"data": {
"text/plain": [
" 编号 年龄 平均血糖\n",
"0 9046 67.0 228.69\n",
"1 51676 61.0 202.21\n",
"2 31112 80.0 105.92"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>编号</th>\n",
" <th>年龄</th>\n",
" <th>平均血糖</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>67.0</td>\n",
" <td>228.69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>61.0</td>\n",
" <td>202.21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>80.0</td>\n",
" <td>105.92</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T02:44:09.987977Z",
"start_time": "2025-04-14T02:44:09.985187Z"
}
},
"cell_type": "code",
"source": [
"print(data1.size)\n",
"data2.size"
],
"id": "40c26c71f24c511d",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15903\n"
]
},
{
"data": {
"text/plain": [
"5301"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 17
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T07:59:22.335960Z",
"start_time": "2025-04-14T07:59:22.326530Z"
}
},
"cell_type": "code",
"source": [
"merge_data = data1.merge(data2, on=['编号'], how='left')\n",
"merge_data.head(3)"
],
"id": "37f42c042c31af5e",
"outputs": [
{
"data": {
"text/plain": [
" 编号 性别 高血压 是否结婚 工作类型 居住类型 体重指数 吸烟史 中风 年龄 平均血糖\n",
"0 9046 男 否 是 私人 城市 36.6 以前吸烟 是 67.0 228.69\n",
"1 51676 女 否 是 私营企业 农村 NaN 从不吸烟 是 61.0 202.21\n",
"2 31112 男 否 是 私人 农村 32.5 从不吸烟 是 80.0 105.92"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>编号</th>\n",
" <th>性别</th>\n",
" <th>高血压</th>\n",
" <th>是否结婚</th>\n",
" <th>工作类型</th>\n",
" <th>居住类型</th>\n",
" <th>体重指数</th>\n",
" <th>吸烟史</th>\n",
" <th>中风</th>\n",
" <th>年龄</th>\n",
" <th>平均血糖</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私人</td>\n",
" <td>城市</td>\n",
" <td>36.6</td>\n",
" <td>以前吸烟</td>\n",
" <td>是</td>\n",
" <td>67.0</td>\n",
" <td>228.69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私营企业</td>\n",
" <td>农村</td>\n",
" <td>NaN</td>\n",
" <td>从不吸烟</td>\n",
" <td>是</td>\n",
" <td>61.0</td>\n",
" <td>202.21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>是</td>\n",
" <td>私人</td>\n",
" <td>农村</td>\n",
" <td>32.5</td>\n",
" <td>从不吸烟</td>\n",
" <td>是</td>\n",
" <td>80.0</td>\n",
" <td>105.92</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 71
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T07:59:24.287769Z",
"start_time": "2025-04-14T07:59:24.284471Z"
}
},
"cell_type": "code",
"source": [
"def age_process(x):\n",
"    if (x % 1 != 0 or x < 0):\n",
"        return None\n",
"    return int(x)"
],
"id": "d45e61b4e5c45d4a",
"outputs": [],
"execution_count": 72
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T07:59:26.832979Z",
"start_time": "2025-04-14T07:59:26.827710Z"
}
},
"cell_type": "code",
"source": "merge_data['年龄'] = merge_data['年龄'].apply(lambda x: age_process(x))",
"id": "b81f4203662a2950",
"outputs": [],
"execution_count": 73
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-14T07:59:30.620159Z",
"start_time": "2025-04-14T07:59:30.606700Z"
}
},
"cell_type": "code",
"source": "merge_data[merge_data['年龄'].isna()]",
"id": "da4b29e8f3d56bc6",
"outputs": [
{
"data": {
"text/plain": [
" 编号 性别 高血压 是否结婚 工作类型 居住类型 体重指数 吸烟史 中风 年龄 平均血糖\n",
"162 69768 女 否 否 学生 城市 NaN 未知 是 NaN 70.37\n",
"363 7559 女 否 否 学生 城市 24.9 未知 否 NaN 83.82\n",
"376 22706 女 否 否 学生 农村 15.5 未知 否 NaN 88.11\n",
"562 45238 女 否 否 学生 城市 16.5 未知 否 NaN 58.26\n",
"564 61511 女 否 否 学生 农村 16.2 未知 否 NaN 73.71\n",
"597 40639 女 否 否 学生 农村 17.5 未知 否 NaN 60.53\n",
"607 9906 女 否 否 学生 城市 17.0 未知 否 NaN 102.34\n",
"684 53016 女 否 否 学生 城市 14.4 未知 否 NaN 130.61\n",
"753 49529 女 否 否 学生 城市 17.2 未知 否 NaN 60.98\n",
"850 41615 女 否 否 学生 农村 18.1 未知 否 NaN 126.18\n",
"913 17733 女 否 否 学生 农村 19.5 未知 否 NaN 109.51\n",
"982 54747 男 否 否 学生 农村 19.2 未知 否 NaN 157.57\n",
"995 60211 男 否 否 学生 城市 18.9 未知 否 NaN 90.51\n",
"996 53279 男 否 否 学生 农村 16.3 未知 否 NaN 118.87\n",
"1093 66772 女 否 否 学生 农村 16.0 未知 否 NaN 55.86\n",
"1101 57854 男 否 否 学生 城市 19.7 未知 否 NaN 56.30\n",
"1134 47848 男 否 否 学生 农村 20.1 未知 否 NaN 93.74\n",
"1137 59734 男 否 否 学生 城市 17.6 未知 否 NaN 75.79\n",
"1206 68908 女 否 否 学生 城市 23.0 未知 否 NaN 66.36\n",
"1218 20282 男 否 否 学生 农村 21.8 未知 否 NaN 77.91\n",
"1244 45554 女 否 否 学生 城市 22.1 未知 否 NaN 62.40\n",
"1317 30084 男 否 否 学生 农村 17.5 未知 否 NaN 98.67\n",
"1366 35737 男 否 否 学生 城市 19.5 未知 否 NaN 86.09\n",
"1486 1405 男 否 否 学生 城市 16.3 未知 否 NaN 111.65\n",
"1499 45357 女 否 否 学生 农村 21.5 未知 否 NaN 113.96\n",
"1600 40544 男 否 否 学生 城市 14.3 未知 否 NaN 109.56\n",
"1609 38043 女 否 否 学生 农村 10.3 未知 否 NaN 122.04\n",
"1614 47350 女 否 否 学生 城市 14.1 未知 否 NaN 139.67\n",
"1632 57485 女 否 否 学生 农村 18.5 未知 否 NaN 55.51\n",
"1758 27279 男 否 否 学生 城市 22.5 未知 否 NaN 90.46"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>编号</th>\n",
" <th>性别</th>\n",
" <th>高血压</th>\n",
" <th>是否结婚</th>\n",
" <th>工作类型</th>\n",
" <th>居住类型</th>\n",
" <th>体重指数</th>\n",
" <th>吸烟史</th>\n",
" <th>中风</th>\n",
" <th>年龄</th>\n",
" <th>平均血糖</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>69768</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>NaN</td>\n",
" <td>未知</td>\n",
" <td>是</td>\n",
" <td>NaN</td>\n",
" <td>70.37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>363</th>\n",
" <td>7559</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>24.9</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>83.82</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376</th>\n",
" <td>22706</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>15.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>88.11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>562</th>\n",
" <td>45238</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>16.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>58.26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>61511</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>16.2</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>73.71</td>\n",
" </tr>\n",
" <tr>\n",
" <th>597</th>\n",
" <td>40639</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>17.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>60.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>607</th>\n",
" <td>9906</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>17.0</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>102.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>684</th>\n",
" <td>53016</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>14.4</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>130.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>753</th>\n",
" <td>49529</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>17.2</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>60.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>850</th>\n",
" <td>41615</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>18.1</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>126.18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>913</th>\n",
" <td>17733</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>19.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>109.51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>982</th>\n",
" <td>54747</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>19.2</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>157.57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>995</th>\n",
" <td>60211</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>18.9</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>90.51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>996</th>\n",
" <td>53279</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>16.3</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>118.87</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1093</th>\n",
" <td>66772</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>16.0</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>55.86</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1101</th>\n",
" <td>57854</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>19.7</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>56.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1134</th>\n",
" <td>47848</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>20.1</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>93.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1137</th>\n",
" <td>59734</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>17.6</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>75.79</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1206</th>\n",
" <td>68908</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>23.0</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>66.36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1218</th>\n",
" <td>20282</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>21.8</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>77.91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1244</th>\n",
" <td>45554</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>22.1</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>62.40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1317</th>\n",
" <td>30084</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>17.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>98.67</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1366</th>\n",
" <td>35737</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>19.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>86.09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1486</th>\n",
" <td>1405</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>16.3</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>111.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1499</th>\n",
" <td>45357</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>21.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>113.96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1600</th>\n",
" <td>40544</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>14.3</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>109.56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1609</th>\n",
" <td>38043</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>10.3</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>122.04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1614</th>\n",
" <td>47350</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>14.1</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>139.67</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1632</th>\n",
" <td>57485</td>\n",
" <td>女</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>农村</td>\n",
" <td>18.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>55.51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1758</th>\n",
" <td>27279</td>\n",
" <td>男</td>\n",
" <td>否</td>\n",
" <td>否</td>\n",
" <td>学生</td>\n",
" <td>城市</td>\n",
" <td>22.5</td>\n",
" <td>未知</td>\n",
" <td>否</td>\n",
" <td>NaN</td>\n",
" <td>90.46</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 74
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}