SGK

处理twitter

Posted by 3thernet on July 7, 2024

仅供学习

压缩文件大小:13,174,986,910 B

解压后产生 Hits3.txt, Hits4.txt, Hits6.txt, Hits7.txt, Hits8.txt, Hits9.txt 共6个文件

格式:Email: jo-mitten@hotmail.co.uk - Name: Joe Osullivan - ScreenName: JoeOsullivan2 - Followers: 263 - Created At: Sun Mar 04 14:35:29 +0000 2012

pd.read_csv进行处理时不能直接令sep='-',因为email和name字段都可能携带短杠

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
from multiprocessing import Pool

def washdata(i):
    chunk_size = 1000000  # 每次读取100万行数据
    filepath = './Hits' + str(i) + '.txt'
    outpath = './Hits' + str(i) + '_new.csv'
    reader = pd.read_csv(filepath, sep=r'\s+-\s(?:Name|ScreenName|Followers|Created\sAt):\s', engine="python", header=None, chunksize=chunk_size)

    for chunk in reader:
        # 去除Email: 前缀
        chunk[0] = chunk[0].str.replace(r'^Email:\s', '', regex=True)

        # 假设createAt是最后一列,将其转换为正确的datetime格式
        chunk.iloc[:, -1] = pd.to_datetime(chunk.iloc[:, -1], format='%a %b %d %H:%M:%S +0000 %Y', errors='coerce')

        # 保存处理后的数据
        chunk.to_csv(outpath, index=False, sep=',', mode='a')

if __name__ == '__main__':
    files_to_process = [3, 4, 6, 7, 8, 9]
    with Pool(processes=6) as pool:  # 使用进程池,进程数量可以根据你的CPU核心数调整
        pool.map(washdata, files_to_process)

建表:

1
2
3
4
5
6
7
8
9
10
11
12
13
CREATE TABLE `twitter` (
  `email` varchar(255) DEFAULT NULL,
  `name` varchar(100) DEFAULT NULL,
  `screenName` varchar(20) DEFAULT NULL,
  `followers` int(11) NOT NULL,
  `createAt` datetime NOT NULL,
  UNIQUE KEY `email` (`email`(50),`screenName`),
  KEY `idx_screenName` (`screenName`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4
PARTITION BY KEY (screenName)
PARTITIONS 10
-- utf8mb4 是因为存在表情符号字符
-- 只对email前50个字符建立索引

导入:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal

set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysql

for %%f in (%DATA_DIR%/*.csv) do (
    echo Loading %%f into MySQL
    "%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
    "LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% CHARACTER SET 'utf8mb4' FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n' IGNORE 1 LINES (email, name, screenName, followers, createAt);"
)

echo Done.
endlocal

注意 CHARACTER SET ‘utf8mb4’ 放在 TABLE 后面