chore(project): 初始化项目结构和配置

- 添加 .idea 目录和相关配置文件,设置项目忽略文件、编码、模块管理等
- 创建商务大数据分析目录和子目录,准备数据和任务笔记本
- 添加示例数据文件:中国城市人口数据.csv
- 创建任务笔记本文件,进行数据处理和分析示例
This commit is contained in:
2025-04-14 16:06:13 +08:00
commit 655911b748
15 changed files with 29981 additions and 0 deletions

View File

@@ -0,0 +1,23 @@
省份,2020年人口（万人）,2019年人口（万人）
河北省,7461,7447
山西省,3492,3497
辽宁省,4259,4277
吉林省,2407,2448
江苏省,8475,8469
浙江省,6457,6375
安徽省,6103,6092
福建省,4154,4137
江西省,4519,4516
山东省,10153,10106
河南省,9937,9901
湖北省,5775,5927
湖南省,6644,6640
广东省,12601,12489
海南省,1008,995
四川省,8367,8351
贵州省,3856,3848
云南省,4721,4714
陕西省,3953,3944
甘肃省,2502,2509
青海省,592,590
黑龙江省,3185,3255
1 省份 2020年人口(万人) 2019年人口(万人)
2 河北省 7461 7447
3 山西省 3492 3497
4 辽宁省 4259 4277
5 吉林省 2407 2448
6 江苏省 8475 8469
7 浙江省 6457 6375
8 安徽省 6103 6092
9 福建省 4154 4137
10 江西省 4519 4516
11 山东省 10153 10106
12 河南省 9937 9901
13 湖北省 5775 5927
14 湖南省 6644 6640
15 广东省 12601 12489
16 海南省 1008 995
17 四川省 8367 8351
18 贵州省 3856 3848
19 云南省 4721 4714
20 陕西省 3953 3944
21 甘肃省 2502 2509
22 青海省 592 590
23 黑龙江省 3185 3255

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,699 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-02T07:51:13.983021Z",
"start_time": "2025-04-02T07:51:13.980852Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 113
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.035104Z",
"start_time": "2025-04-02T07:51:14.008139Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/某地区房屋销售数据 (1).csv', encoding='gbk')\n",
"data.head(5)"
],
"id": "6f3a167b4381943a",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数\n",
"0 2010/1/4 0:00 2615 435000 house 3\n",
"1 2010/1/5 0:00 2904 712000 house 4\n",
"2 2010/1/6 0:00 2617 435000 house 4\n",
"3 2010/1/6 0:00 2606 1350000 house 5\n",
"4 2010/1/7 0:00 2905 612500 house 4"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 114
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.079308Z",
"start_time": "2025-04-02T07:51:14.069694Z"
}
},
"cell_type": "code",
"source": [
"# Region key = first two characters of the postcode, taken with the vectorized\n",
"# .str accessor instead of a per-row Python lambda.\n",
"data['new_postcode'] = data['地区邮编'].astype(str).str[:2]\n",
"data.head(5)"
],
"id": "817b591e756eaf93",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 2010/1/7 0:00 2905 612500 house 4 29"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 115
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.136665Z",
"start_time": "2025-04-02T07:51:14.129644Z"
}
},
"cell_type": "code",
"source": "data.groupby('new_postcode').agg({'房屋出售时间':'count'})",
"id": "4f648cd98de38213",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间\n",
"new_postcode \n",
"26 16393\n",
"29 10975"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>16393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>10975</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 116
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.229857Z",
"start_time": "2025-04-02T07:51:14.216154Z"
}
},
"cell_type": "code",
"source": [
"# The former groupby(['房屋类型', 'new_postcode']).apply(lambda x: x) was an identity\n",
"# round-trip: it handed back every row unchanged, and reset_index() then only added\n",
"# the `index` column. Newer pandas releases prepend the group keys to the result\n",
"# index, which makes that reset_index() collide with the existing columns, so build\n",
"# the same frame directly.\n",
"housesale1 = data.reset_index()\n",
"housesale1"
],
"id": "31e96124eb1769ea",
"outputs": [
{
"data": {
"text/plain": [
" index 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 4 2010/1/7 0:00 2905 612500 house 4 29\n",
"... ... ... ... ... ... ... ...\n",
"27363 27363 2019/7/25 0:00 2900 500000 unit 3 29\n",
"27364 27364 2019/7/25 0:00 2612 560000 unit 2 26\n",
"27365 27365 2019/7/26 0:00 2912 464950 unit 2 29\n",
"27366 27366 2019/7/26 0:00 2601 589000 unit 2 26\n",
"27367 27367 2019/7/26 0:00 2612 775000 unit 2 26\n",
"\n",
"[27368 rows x 7 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27363</th>\n",
" <td>27363</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2900</td>\n",
" <td>500000</td>\n",
" <td>unit</td>\n",
" <td>3</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27364</th>\n",
" <td>27364</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2612</td>\n",
" <td>560000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27365</th>\n",
" <td>27365</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2912</td>\n",
" <td>464950</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27366</th>\n",
" <td>27366</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2601</td>\n",
" <td>589000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27367</th>\n",
" <td>27367</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2612</td>\n",
" <td>775000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27368 rows × 7 columns</p>\n",
"</div>"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 117
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.304214Z",
"start_time": "2025-04-02T07:51:14.298702Z"
}
},
"cell_type": "code",
"source": "data['平均价格'] = data.groupby(['房屋类型', 'new_postcode'])['房屋价格'].transform('mean')",
"id": "5249fcce9b76b48f",
"outputs": [],
"execution_count": 118
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:09.492950Z",
"start_time": "2025-04-02T07:52:09.480225Z"
}
},
"cell_type": "code",
"source": [
"data2 = data.drop_duplicates(['房屋类型','new_postcode'],inplace=False)\n",
"data2"
],
"id": "93afa495c804a0f6",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode 平均价格\n",
"0 2010/1/4 0:00 2615 435000 house 3 26 725040.113978\n",
"1 2010/1/5 0:00 2904 712000 house 4 29 582085.199671\n",
"22595 2010/1/11 0:00 2602 270000 unit 1 26 434573.470446\n",
"22607 2010/2/9 0:00 2900 436000 unit 2 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>2010/1/11 0:00</td>\n",
" <td>2602</td>\n",
" <td>270000</td>\n",
" <td>unit</td>\n",
" <td>1</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>2010/2/9 0:00</td>\n",
" <td>2900</td>\n",
" <td>436000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 125
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:17.149242Z",
"start_time": "2025-04-02T07:52:17.142432Z"
}
},
"cell_type": "code",
"source": "data2[['房屋类型','new_postcode','平均价格']]",
"id": "9dd96081baad6b3d",
"outputs": [
{
"data": {
"text/plain": [
" 房屋类型 new_postcode 平均价格\n",
"0 house 26 725040.113978\n",
"1 house 29 582085.199671\n",
"22595 unit 26 434573.470446\n",
"22607 unit 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋类型</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>house</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>house</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>unit</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>unit</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 126
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,478 @@
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:57:56.382179Z",
"start_time": "2025-04-02T07:57:55.984261Z"
}
},
"cell_type": "code",
"source": "import pandas as pd",
"id": "3244cf38b10be81b",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:00:15.267189Z",
"start_time": "2025-04-02T08:00:15.229542Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/某地区房屋销售数据 (1).csv', encoding='gbk')\n",
"# Region key = first two characters of the postcode, taken with the vectorized\n",
"# .str accessor instead of a per-row Python lambda.\n",
"data['new_postcode'] = data['地区邮编'].astype(str).str[:2]\n",
"data.head(3)"
],
"id": "d973cf9fe6ac90a6",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2010/1/6 0:00 2617 435000 house 4 26"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:00:25.320359Z",
"start_time": "2025-04-02T08:00:25.301349Z"
}
},
"cell_type": "code",
"source": [
"# 1、求出不同地区和不同房间数的房价使用pivot_table函数\n",
"data.pivot_table(values='房屋价格', index='new_postcode', columns='配套房间数', aggfunc='mean')"
],
"id": "c9d4b29b2fbd4334",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 \\\n",
"new_postcode \n",
"26 564125.0 343189.962401 457595.588277 624204.46900 \n",
"29 528000.0 292934.514286 381675.627240 475210.25609 \n",
"\n",
"配套房间数 4 5 \n",
"new_postcode \n",
"26 810389.319007 1.037034e+06 \n",
"29 651102.874716 7.995584e+05 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>564125.0</td>\n",
" <td>343189.962401</td>\n",
" <td>457595.588277</td>\n",
" <td>624204.46900</td>\n",
" <td>810389.319007</td>\n",
" <td>1.037034e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>528000.0</td>\n",
" <td>292934.514286</td>\n",
" <td>381675.627240</td>\n",
" <td>475210.25609</td>\n",
" <td>651102.874716</td>\n",
" <td>7.995584e+05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:04:02.430064Z",
"start_time": "2025-04-02T08:04:02.415284Z"
}
},
"cell_type": "code",
"source": [
"# 2、不同地区哪种类型的房产房价最贵使用pivot_table函数\n",
"data.pivot_table(values='房屋价格', index='new_postcode', columns='房屋类型', aggfunc='max')"
],
"id": "a5e4f3321d168313",
"outputs": [
{
"data": {
"text/plain": [
"房屋类型 house unit\n",
"new_postcode \n",
"26 8000000 2500000\n",
"29 5425000 769500"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>房屋类型</th>\n",
" <th>house</th>\n",
" <th>unit</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>8000000</td>\n",
" <td>2500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>5425000</td>\n",
" <td>769500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:04:41.035870Z",
"start_time": "2025-04-02T08:04:41.012959Z"
}
},
"cell_type": "code",
"source": [
"# 3、不同类型房产和不同房间数的房价之间的比较使用pivot_table函数\n",
"data.pivot_table(values='房屋价格', index='房屋类型', columns='配套房间数', aggfunc='mean')"
],
"id": "4ed9b36daea1c503",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 \\\n",
"房屋类型 \n",
"house 677394.736842 353634.269663 489555.889339 560117.683516 \n",
"unit 330850.000000 336570.325391 432502.153116 594535.982287 \n",
"\n",
"配套房间数 4 5 \n",
"房屋类型 \n",
"house 730667.024375 9.290297e+05 \n",
"unit 641736.842105 1.146333e+06 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>房屋类型</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>house</th>\n",
" <td>677394.736842</td>\n",
" <td>353634.269663</td>\n",
" <td>489555.889339</td>\n",
" <td>560117.683516</td>\n",
" <td>730667.024375</td>\n",
" <td>9.290297e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unit</th>\n",
" <td>330850.000000</td>\n",
" <td>336570.325391</td>\n",
" <td>432502.153116</td>\n",
" <td>594535.982287</td>\n",
" <td>641736.842105</td>\n",
" <td>1.146333e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T08:05:23.703349Z",
"start_time": "2025-04-02T08:05:23.691916Z"
}
},
"cell_type": "code",
"source": [
"# 4、不同地区不同房间数房屋销售情况交叉表使用crosstab函数参考例3-61\n",
"pd.crosstab(data['new_postcode'], data['配套房间数'])"
],
"id": "799d99489d93b2b5",
"outputs": [
{
"data": {
"text/plain": [
"配套房间数 0 1 2 3 4 5\n",
"new_postcode \n",
"26 24 1383 2815 6371 4793 1007\n",
"29 5 175 558 4557 4845 835"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>配套房间数</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>24</td>\n",
" <td>1383</td>\n",
" <td>2815</td>\n",
" <td>6371</td>\n",
" <td>4793</td>\n",
" <td>1007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>5</td>\n",
" <td>175</td>\n",
" <td>558</td>\n",
" <td>4557</td>\n",
" <td>4845</td>\n",
" <td>835</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 10
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-08T10:57:29.263616Z",
"start_time": "2025-04-08T10:57:28.865194Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T10:59:11.991479Z",
"start_time": "2025-04-08T10:59:11.985778Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/中国城市人口数据.csv',encoding=\"GBK\")\n",
"# The committed CSV header wraps the unit in parentheses ('2020年人口（万人）'), while\n",
"# every later cell indexes '2020年人口万人'. Strip ASCII and full-width parentheses so\n",
"# either header variant loads under the column names the notebook uses.\n",
"data.columns = data.columns.str.replace(r'[()（）]', '', regex=True)\n",
"data.head(5)"
],
"id": "c3fd933261d1f7fb",
"outputs": [
{
"data": {
"text/plain": [
" 省份 2020年人口万人 2019年人口万人\n",
"0 河北省 7461 7447\n",
"1 山西省 3492 3497\n",
"2 辽宁省 4259 4277\n",
"3 吉林省 2407 2448\n",
"4 江苏省 8475 8469"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>省份</th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>河北省</td>\n",
" <td>7461</td>\n",
" <td>7447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>山西省</td>\n",
" <td>3492</td>\n",
" <td>3497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>辽宁省</td>\n",
" <td>4259</td>\n",
" <td>4277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>吉林省</td>\n",
" <td>2407</td>\n",
" <td>2448</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>江苏省</td>\n",
" <td>8475</td>\n",
" <td>8469</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:42.958649Z",
"start_time": "2025-04-08T11:10:42.956278Z"
}
},
"cell_type": "code",
"source": "data.shape",
"id": "a0d05b5dea7e5cfc",
"outputs": [
{
"data": {
"text/plain": [
"(22, 3)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:37.015499Z",
"start_time": "2025-04-08T11:10:37.005663Z"
}
},
"cell_type": "code",
"source": "data.info()",
"id": "b602f50b182485dd",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 22 entries, 0 to 21\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 省份 22 non-null object\n",
" 1 2020年人口万人 22 non-null int64 \n",
" 2 2019年人口万人 22 non-null int64 \n",
"dtypes: int64(2), object(1)\n",
"memory usage: 656.0+ bytes\n"
]
}
],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:01:22.100819Z",
"start_time": "2025-04-08T11:01:22.080321Z"
}
},
"cell_type": "code",
"source": "data.describe()",
"id": "1218c2b44c21d012",
"outputs": [
{
"data": {
"text/plain": [
" 2020年人口万人 2019年人口万人\n",
"count 22.000000 22.000000\n",
"mean 5482.772727 5478.500000\n",
"std 3067.216187 3043.789239\n",
"min 592.000000 590.000000\n",
"25% 3583.000000 3584.750000\n",
"50% 4620.000000 4615.000000\n",
"75% 7256.750000 7245.250000\n",
"max 12601.000000 12489.000000"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>22.000000</td>\n",
" <td>22.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>5482.772727</td>\n",
" <td>5478.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>3067.216187</td>\n",
" <td>3043.789239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>592.000000</td>\n",
" <td>590.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>3583.000000</td>\n",
" <td>3584.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>4620.000000</td>\n",
" <td>4615.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>7256.750000</td>\n",
" <td>7245.250000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>12601.000000</td>\n",
" <td>12489.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:10:29.629195Z",
"start_time": "2025-04-08T11:10:29.625545Z"
}
},
"cell_type": "code",
"source": [
"# 2020年总人口\n",
"data['2020年人口万人'].sum()"
],
"id": "93faae0d69a5d4e2",
"outputs": [
{
"data": {
"text/plain": [
"120621"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:11:33.375487Z",
"start_time": "2025-04-08T11:11:33.361781Z"
}
},
"cell_type": "code",
"source": [
"# 2019年总人口\n",
"data['2019年人口万人'].sum()"
],
"id": "81d587e3605ba734",
"outputs": [
{
"data": {
"text/plain": [
"120527"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 16
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:13:18.823381Z",
"start_time": "2025-04-08T11:13:18.809605Z"
}
},
"cell_type": "code",
"source": [
"# 总人口对比\n",
"data['2020年人口万人'].sum() - data['2019年人口万人'].sum()"
],
"id": "2e5f8e1821c05cdf",
"outputs": [
{
"data": {
"text/plain": [
"94"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 20
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-08T11:13:20.739094Z",
"start_time": "2025-04-08T11:13:20.731449Z"
}
},
"cell_type": "code",
"source": [
"# Per-province population change (2020 minus 2019).\n",
"# Work on a copy: plain assignment would only alias `data`, so adding `compare`\n",
"# would silently add the column to the original frame as well.\n",
"data2 = data.copy()\n",
"data2['compare'] = data2['2020年人口万人'] - data2['2019年人口万人']\n",
"data2.head(5)"
],
"id": "e7bda6c2c79305bb",
"outputs": [
{
"data": {
"text/plain": [
" 省份 2020年人口万人 2019年人口万人 compare\n",
"0 河北省 7461 7447 14\n",
"1 山西省 3492 3497 -5\n",
"2 辽宁省 4259 4277 -18\n",
"3 吉林省 2407 2448 -41\n",
"4 江苏省 8475 8469 6"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>省份</th>\n",
" <th>2020年人口万人</th>\n",
" <th>2019年人口万人</th>\n",
" <th>compare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>河北省</td>\n",
" <td>7461</td>\n",
" <td>7447</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>山西省</td>\n",
" <td>3492</td>\n",
" <td>3497</td>\n",
" <td>-5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>辽宁省</td>\n",
" <td>4259</td>\n",
" <td>4277</td>\n",
" <td>-18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>吉林省</td>\n",
" <td>2407</td>\n",
" <td>2448</td>\n",
" <td>-41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>江苏省</td>\n",
" <td>8475</td>\n",
" <td>8469</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 21
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}