dev_xulongjin 655911b748 chore(project): 初始化项目结构和配置
- 添加 .idea 目录和相关配置文件,设置项目忽略文件、编码、模块管理等
- 创建商务大数据分析目录和子目录,准备数据和任务笔记本
- 添加示例数据文件:中国城市人口数据.csv
- 创建任务笔记本文件,进行数据处理和分析示例
2025-04-14 16:06:13 +08:00

700 lines
20 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-04-02T07:51:13.983021Z",
"start_time": "2025-04-02T07:51:13.980852Z"
}
},
"source": "import pandas as pd",
"outputs": [],
"execution_count": 113
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.035104Z",
"start_time": "2025-04-02T07:51:14.008139Z"
}
},
"cell_type": "code",
"source": [
"data = pd.read_csv('data/某地区房屋销售数据 (1).csv', encoding='gbk')\n",
"data.head(5)"
],
"id": "6f3a167b4381943a",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数\n",
"0 2010/1/4 0:00 2615 435000 house 3\n",
"1 2010/1/5 0:00 2904 712000 house 4\n",
"2 2010/1/6 0:00 2617 435000 house 4\n",
"3 2010/1/6 0:00 2606 1350000 house 5\n",
"4 2010/1/7 0:00 2905 612500 house 4"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 114
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.079308Z",
"start_time": "2025-04-02T07:51:14.069694Z"
}
},
"cell_type": "code",
"source": [
"data['new_postcode'] = data['地区邮编'].apply(lambda x: str(x)[:2])\n",
"data.head(5)"
],
"id": "817b591e756eaf93",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 2010/1/7 0:00 2905 612500 house 4 29"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 115
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.136665Z",
"start_time": "2025-04-02T07:51:14.129644Z"
}
},
"cell_type": "code",
"source": "data.groupby('new_postcode').agg({'房屋出售时间':'count'})",
"id": "4f648cd98de38213",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间\n",
"new_postcode \n",
"26 16393\n",
"29 10975"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" </tr>\n",
" <tr>\n",
" <th>new_postcode</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>16393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>10975</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 116
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.229857Z",
"start_time": "2025-04-02T07:51:14.216154Z"
}
},
"cell_type": "code",
"source": [
"housesale1 = data.groupby(['房屋类型', 'new_postcode']).apply(lambda x:x).reset_index()\n",
"housesale1"
],
"id": "31e96124eb1769ea",
"outputs": [
{
"data": {
"text/plain": [
" index 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode\n",
"0 0 2010/1/4 0:00 2615 435000 house 3 26\n",
"1 1 2010/1/5 0:00 2904 712000 house 4 29\n",
"2 2 2010/1/6 0:00 2617 435000 house 4 26\n",
"3 3 2010/1/6 0:00 2606 1350000 house 5 26\n",
"4 4 2010/1/7 0:00 2905 612500 house 4 29\n",
"... ... ... ... ... ... ... ...\n",
"27363 27363 2019/7/25 0:00 2900 500000 unit 3 29\n",
"27364 27364 2019/7/25 0:00 2612 560000 unit 2 26\n",
"27365 27365 2019/7/26 0:00 2912 464950 unit 2 29\n",
"27366 27366 2019/7/26 0:00 2601 589000 unit 2 26\n",
"27367 27367 2019/7/26 0:00 2612 775000 unit 2 26\n",
"\n",
"[27368 rows x 7 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2617</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>2010/1/6 0:00</td>\n",
" <td>2606</td>\n",
" <td>1350000</td>\n",
" <td>house</td>\n",
" <td>5</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>2010/1/7 0:00</td>\n",
" <td>2905</td>\n",
" <td>612500</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27363</th>\n",
" <td>27363</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2900</td>\n",
" <td>500000</td>\n",
" <td>unit</td>\n",
" <td>3</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27364</th>\n",
" <td>27364</td>\n",
" <td>2019/7/25 0:00</td>\n",
" <td>2612</td>\n",
" <td>560000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27365</th>\n",
" <td>27365</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2912</td>\n",
" <td>464950</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27366</th>\n",
" <td>27366</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2601</td>\n",
" <td>589000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27367</th>\n",
" <td>27367</td>\n",
" <td>2019/7/26 0:00</td>\n",
" <td>2612</td>\n",
" <td>775000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>27368 rows × 7 columns</p>\n",
"</div>"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 117
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:51:14.304214Z",
"start_time": "2025-04-02T07:51:14.298702Z"
}
},
"cell_type": "code",
"source": "data['平均价格'] = data.groupby(['房屋类型', 'new_postcode'])['房屋价格'].transform('mean')",
"id": "5249fcce9b76b48f",
"outputs": [],
"execution_count": 118
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:09.492950Z",
"start_time": "2025-04-02T07:52:09.480225Z"
}
},
"cell_type": "code",
"source": [
"data2 = data.drop_duplicates(['房屋类型','new_postcode'],inplace=False)\n",
"data2"
],
"id": "93afa495c804a0f6",
"outputs": [
{
"data": {
"text/plain": [
" 房屋出售时间 地区邮编 房屋价格 房屋类型 配套房间数 new_postcode 平均价格\n",
"0 2010/1/4 0:00 2615 435000 house 3 26 725040.113978\n",
"1 2010/1/5 0:00 2904 712000 house 4 29 582085.199671\n",
"22595 2010/1/11 0:00 2602 270000 unit 1 26 434573.470446\n",
"22607 2010/2/9 0:00 2900 436000 unit 2 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋出售时间</th>\n",
" <th>地区邮编</th>\n",
" <th>房屋价格</th>\n",
" <th>房屋类型</th>\n",
" <th>配套房间数</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2010/1/4 0:00</td>\n",
" <td>2615</td>\n",
" <td>435000</td>\n",
" <td>house</td>\n",
" <td>3</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2010/1/5 0:00</td>\n",
" <td>2904</td>\n",
" <td>712000</td>\n",
" <td>house</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>2010/1/11 0:00</td>\n",
" <td>2602</td>\n",
" <td>270000</td>\n",
" <td>unit</td>\n",
" <td>1</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>2010/2/9 0:00</td>\n",
" <td>2900</td>\n",
" <td>436000</td>\n",
" <td>unit</td>\n",
" <td>2</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 125
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-04-02T07:52:17.149242Z",
"start_time": "2025-04-02T07:52:17.142432Z"
}
},
"cell_type": "code",
"source": "data2[['房屋类型','new_postcode','平均价格']]",
"id": "9dd96081baad6b3d",
"outputs": [
{
"data": {
"text/plain": [
" 房屋类型 new_postcode 平均价格\n",
"0 house 26 725040.113978\n",
"1 house 29 582085.199671\n",
"22595 unit 26 434573.470446\n",
"22607 unit 29 369109.530255"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>房屋类型</th>\n",
" <th>new_postcode</th>\n",
" <th>平均价格</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>house</td>\n",
" <td>26</td>\n",
" <td>725040.113978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>house</td>\n",
" <td>29</td>\n",
" <td>582085.199671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22595</th>\n",
" <td>unit</td>\n",
" <td>26</td>\n",
" <td>434573.470446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22607</th>\n",
" <td>unit</td>\n",
" <td>29</td>\n",
" <td>369109.530255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 126
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}