仅供学习
建表:
1
2
3
4
5
6
7
8
CREATE TABLE `weibo` (
`mobile` varchar(13) COLLATE utf8_unicode_ci NOT NULL,
`uid` bigint(20) NOT NULL,
UNIQUE KEY `mobile` (`mobile`,`uid`),
KEY `idx_uid` (`uid`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
PARTITION BY HASH (uid)
PARTITIONS 10
拆分脚本:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
def validate_format(line):
parts = line.strip().split('\t')
if len(parts) == 2:
return True
return False
def split_large_csv(input_file, output_dir, max_size_gb):
# 计算最大文件大小(以字节为单位)
max_size_bytes = max_size_gb * 1024 * 1024 * 1024
# 初始化当前文件大小和块计数器
current_size = 0
chunk_count = 1
# 打开文件保存格式不正确的行
error_file = open(os.path.join(output_dir, 'error_lines'), 'w', encoding='utf-8')
# 逐行读取输入文件
with open(input_file, 'r', encoding='utf-8') as infile:
outfile = None
for line in infile:
if not validate_format(line):
# 将不正确的行写入错误文件
error_file.write(line)
continue # 跳过后续处理,直接处理下一行
line_size = len(line.encode('utf-8'))
if outfile is None or current_size + line_size > max_size_bytes:
if outfile:
outfile.close()
outfile = open(os.path.join(output_dir, f'chunk_{chunk_count}.csv'), 'w', encoding='utf-8')
chunk_count += 1
current_size = 0
outfile.write(line)
current_size += line_size
if outfile:
outfile.close()
# 调用函数
split_large_csv('weibo.txt', '.', max_size_gb=1)
导入:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal
set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysql
for %%f in (%DATA_DIR%/*.csv) do (
echo Loading %%f into MySQL
"%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
"LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' (mobile, uid);"
)
echo Done.
endlocal
中途我有因为硬盘容量不够导致卡死,清理了一波然后repair table weibo
继续导入,最终数据量为503925364
,相比5.38亿的原始数据量少了一些
8e?
建表:
1
2
3
4
5
6
7
8
CREATE TABLE `qq` (
`qq` bigint(20) NOT NULL,
`mobile` varchar(13) COLLATE utf8_unicode_ci NOT NULL,
PRIMARY KEY (`qq`,`mobile`),
KEY `idx_mobile` (`mobile`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
PARTITION BY HASH (qq)
PARTITIONS 10
拆分脚本略
导入:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal
set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysqlATH=mysql
for %%f in (%DATA_DIR%/*.csv) do (
echo Loading %%f into MySQL
"%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
"LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% FIELDS TERMINATED BY '----' LINES TERMINATED BY '\r\n' (qq, mobile);"
)
echo Done.
endlocal
实际上导入719543817条