仅供学习
压缩文件大小:13,174,986,910 B
解压后产生 Hits3.txt, Hits4.txt, Hits6.txt, Hits7.txt, Hits8.txt, Hits9.txt 共6个文件
格式:Email: jo-mitten@hotmail.co.uk - Name: Joe Osullivan - ScreenName: JoeOsullivan2 - Followers: 263 - Created At: Sun Mar 04 14:35:29 +0000 2012
用pd.read_csv
进行处理时不能直接令sep='-'
,因为email和name字段都可能携带短杠
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
from multiprocessing import Pool
def washdata(i):
chunk_size = 1000000 # 每次读取100万行数据
filepath = './Hits' + str(i) + '.txt'
outpath = './Hits' + str(i) + '_new.csv'
reader = pd.read_csv(filepath, sep=r'\s+-\s(?:Name|ScreenName|Followers|Created\sAt):\s', engine="python", header=None, chunksize=chunk_size)
for chunk in reader:
# 去除Email: 前缀
chunk[0] = chunk[0].str.replace(r'^Email:\s', '', regex=True)
# 假设createAt是最后一列,将其转换为正确的datetime格式
chunk.iloc[:, -1] = pd.to_datetime(chunk.iloc[:, -1], format='%a %b %d %H:%M:%S +0000 %Y', errors='coerce')
# 保存处理后的数据
chunk.to_csv(outpath, index=False, sep=',', mode='a')
if __name__ == '__main__':
files_to_process = [3, 4, 6, 7, 8, 9]
with Pool(processes=6) as pool: # 使用进程池,进程数量可以根据你的CPU核心数调整
pool.map(washdata, files_to_process)
建表:
1
2
3
4
5
6
7
8
9
10
11
12
13
CREATE TABLE `twitter` (
`email` varchar(255) DEFAULT NULL,
`name` varchar(100) DEFAULT NULL,
`screenName` varchar(20) DEFAULT NULL,
`followers` int(11) NOT NULL,
`createAt` datetime NOT NULL,
UNIQUE KEY `email` (`email`(50),`screenName`),
KEY `idx_screenName` (`screenName`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4
PARTITION BY KEY (screenName)
PARTITIONS 10
-- utf8mb4 是因为存在表情符号字符
-- 只对email前50个字符建立索引
导入:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal
set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysql
for %%f in (%DATA_DIR%/*.csv) do (
echo Loading %%f into MySQL
"%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
"LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% CHARACTER SET 'utf8mb4' FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n' IGNORE 1 LINES (email, name, screenName, followers, createAt);"
)
echo Done.
endlocal
注意 CHARACTER SET ‘utf8mb4’ 放在 TABLE 后面