SGK

如何处理weibo,qq

Posted by 3thernet on July 11, 2024

仅供学习

weibo

建表:

1
2
3
4
5
6
7
8
CREATE TABLE `weibo` (
  `mobile` varchar(13) COLLATE utf8_unicode_ci NOT NULL,
  `uid` bigint(20) NOT NULL,
  UNIQUE KEY `mobile` (`mobile`,`uid`),
  KEY `idx_uid` (`uid`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
PARTITION BY HASH (uid)
PARTITIONS 10 

拆分脚本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os

def validate_format(line):
    parts = line.strip().split('\t')
    if len(parts) == 2:
        return True
    return False

def split_large_csv(input_file, output_dir, max_size_gb):
    # 计算最大文件大小(以字节为单位)
    max_size_bytes = max_size_gb * 1024 * 1024 * 1024

    # 初始化当前文件大小和块计数器
    current_size = 0
    chunk_count = 1

    # 打开文件保存格式不正确的行
    error_file = open(os.path.join(output_dir, 'error_lines'), 'w', encoding='utf-8')

    # 逐行读取输入文件
    with open(input_file, 'r', encoding='utf-8') as infile:
        outfile = None

        for line in infile:
            if not validate_format(line):
                # 将不正确的行写入错误文件
                error_file.write(line)
                continue  # 跳过后续处理,直接处理下一行

            line_size = len(line.encode('utf-8'))

            if outfile is None or current_size + line_size > max_size_bytes:
                if outfile:
                    outfile.close()
                outfile = open(os.path.join(output_dir, f'chunk_{chunk_count}.csv'), 'w', encoding='utf-8')
                chunk_count += 1
                current_size = 0

            outfile.write(line)
            current_size += line_size

        if outfile:
            outfile.close()

# 调用函数
split_large_csv('weibo.txt', '.', max_size_gb=1)

导入:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal

set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysql

for %%f in (%DATA_DIR%/*.csv) do (
    echo Loading %%f into MySQL
    "%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
    "LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\r\n' (mobile, uid);"
)

echo Done.
endlocal

中途我有因为硬盘容量不够导致卡死,清理了一波然后repair table weibo继续导入,最终数据量为503925364,相比5.38亿的原始数据量少了一些

qq

8e?

建表:

1
2
3
4
5
6
7
8
CREATE TABLE `qq` (
  `qq` bigint(20) NOT NULL,
  `mobile` varchar(13) COLLATE utf8_unicode_ci NOT NULL,
  PRIMARY KEY (`qq`,`mobile`),
  KEY `idx_mobile` (`mobile`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
PARTITION BY HASH (qq)
PARTITIONS 10

拆分脚本略

导入:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
@echo off
setlocal

set USER=your_username
set PASSWORD=your_password
set DATABASE=your_database
set TABLE=your_table
set HOST=localhost
set DATA_DIR=./
set MYSQL_PATH=mysqlATH=mysql

for %%f in (%DATA_DIR%/*.csv) do (
    echo Loading %%f into MySQL
    "%MYSQL_PATH%" -h %HOST% -u %USER% -p%PASSWORD% %DATABASE% -e ^
    "LOAD DATA LOCAL INFILE '%%f' INTO TABLE %TABLE% FIELDS TERMINATED BY '----' LINES TERMINATED BY '\r\n' (qq, mobile);"
)

echo Done.
endlocal

实际上导入719543817条