Analyzing the Linux core dump process

When a Linux process terminates abnormally or crashes, Linux saves the process's memory contents, register state, call stacks, and other information into a file on the filesystem. This procedure is called a core dump, and the generated core file can be loaded into gdb to diagnose the program's problems. Recently I had some doubts about how the core file is generated and how the corresponding process exits: is the core file written by the crashing task itself in the kernel before it exits, or does the task exit first, with the kernel then detecting the crash and generating the file separately? With this question in mind I ran some experiments and read the kernel code.

By default, the core dump file is named core.<pid>, e.g. core.12345. The kernel parameter /proc/sys/kernel/core_pattern defines a template for the core dump file name; the template specifiers are described in the core(5) documentation (for example, core.%e.%p expands to the executable name plus the PID). The kernel also has a handy feature: if the first character of core_pattern is the pipe character |, the kernel executes the program configured after the |, and feeds the core dump content to that userspace program through a pipe.

The ABRT (Automated Bug Reporting Tool) service uses exactly this mechanism to collect core files of crashed applications: the abrt-ccpp service sets core_pattern to:

|/usr/libexec/abrt-hook-ccpp %s %c %p %u %g %t e %P %I %h

Let's start with a simple program that crashes as soon as it runs; bad.c:

#include <stdio.h>

int main(int argc, char **argv) {
    char *p = NULL;
    char c = *p;
    return 0;
}

Set the current shell's core dump file size limit to unlimited:

ulimit -c unlimited

Compile and run the program to generate a core dump file; the process exits very quickly:

[root@default ~]# gcc -g bad.c -o bad
[root@default ~]# time ./bad
Segmentation fault (core dumped)

real 0m0.007s
user 0m0.000s
sys 0m0.006s

The coredump helper script /root/core.dumper contains:

#!/bin/bash
/usr/bin/logger "core.dumper: start sleeping 60 seconds"
sleep 60
/usr/bin/logger "core.dumper: end"

Add execute permission:

chmod a+x /root/core.dumper

Modify core_pattern:

[root@default ~]# cat /proc/sys/kernel/core_pattern
core
[root@default ~]# echo '|/root/core.dumper' > /proc/sys/kernel/core_pattern
[root@default ~]# cat /proc/sys/kernel/core_pattern
|/root/core.dumper

Run the crashing program again; this time it does not return until 60 seconds later:

[root@default ~]# time ./bad
Segmentation fault (core dumped)

real 1m0.034s
user 0m0.003s
sys 0m0.006s

Looking at the bad process from another shell, it is still alive:

[root@default ~]# ps aux |grep bad
root 5932 0.0 0.0 4212 348 pts/0 S+ 03:53 0:00 ./bad
root 6030 0.0 0.1 12528 972 pts/1 R+ 03:53 0:00 grep --color=auto bad

Now look at that process's kernel call stack:

[root@default ~]# cat /proc/5932/stack
[<ffffffff9a656730>] pipe_wait+0x70/0xc0
[<ffffffff9a6569d9>] pipe_write+0x1f9/0x540
[<ffffffff9a64c663>] do_sync_write+0x93/0xe0
[<ffffffff9a6b80c8>] dump_write+0x58/0x70
[<ffffffff9a6b31df>] elf_core_dump+0x84f/0x960
[<ffffffff9a6b9117>] do_coredump+0x827/0xac0
[<ffffffff9a4b3ae5>] get_signal_to_deliver+0x1c5/0x5e0
[<ffffffff9a42c527>] do_signal+0x57/0x6f0
[<ffffffff9a42cc32>] do_notify_resume+0x72/0xc0
[<ffffffff9ab8957c>] retint_signal+0x48/0x8c
[<ffffffffffffffff>] 0xffffffffffffffff

We can see the process is blocked in a pipe write, waiting for the data to be consumed. This confirms that the core dump file is generated by the crashing process itself before it exits.

Now let's walk through the concrete flow in the kernel code. The source referenced here is the CentOS kernel-3.10.0-1160.el7 source tree; signal-handling logic is architecture specific, and the x86 implementation is used here.

When the faulty program above dereferences the NULL pointer, a SIGSEGV signal is raised. The Linux kernel puts the signal on the task's signal queue, and when the task returns from kernel mode to user mode, do_notify_resume (arch/x86/kernel/signal.c) is called to handle the pending signals:

/*
 * notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 */
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
	user_exit();

	if (thread_info_flags & _TIF_UPROBE)
		uprobe_notify_resume(regs);

	if (thread_info_flags & _TIF_PATCH_PENDING)
		klp_update_patch_state(current);

	/* deal with pending signal delivery */
	if (thread_info_flags & _TIF_SIGPENDING)
		do_signal(regs);

	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
	}
	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();

	user_enter();
}

_TIF_SIGPENDING indicates there is a signal to handle, and do_signal is called to process it:

/*
 * Note that 'init' is a special process: it doesn't get signals it doesn't
 * want to handle. Thus you cannot kill init even with a SIGKILL even by
 * mistake.
 */
static void do_signal(struct pt_regs *regs)
{
	struct ksignal ksig;

	if (get_signal(&ksig)) {
		/* Whee! Actually deliver the signal.  */
		handle_signal(&ksig, regs);
		return;
	}

	/* Did we come from a system call? */
	if (syscall_get_nr(current, regs) >= 0) {
		/* Restart the system call - no handlers present */
		switch (syscall_get_error(current, regs)) {
		case -ERESTARTNOHAND:
		case -ERESTARTSYS:
		case -ERESTARTNOINTR:
			regs->ax = regs->orig_ax;
			regs->ip -= 2;
			break;

		case -ERESTART_RESTARTBLOCK:
			regs->ax = NR_restart_syscall;
			regs->ip -= 2;
			break;
		}
	}

	/*
	 * If there's no signal to deliver, we just put the saved sigmask
	 * back.
	 */
	restore_saved_sigmask();
}

Here get_signal is a macro that calls get_signal_to_deliver to process the signal:

/*
 * Eventually that'll replace get_signal_to_deliver(); macro for now,
 * to avoid nastiness with include order.
 */
#define get_signal(ksig)					\
({								\
	struct ksignal *p = (ksig);				\
	p->sig = get_signal_to_deliver(&p->info, &p->ka,	\
					signal_pt_regs(), NULL);\
	p->sig > 0;						\
})

get_signal_to_deliver is a fairly long function that loops over the signals the process has received:

for (;;) {
	struct k_sigaction *ka;
	...
	signr = dequeue_signal(current, &current->blocked, info);

	if (!signr)
		break; /* will return 0 */
	...
	ka = &sighand->action[signr-1];

	/* Trace actually delivered signals. */
	trace_signal_deliver(signr, info, ka);

	if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
		continue;
	if (ka->sa.sa_handler != SIG_DFL) {
		/* Run the handler.  */
		*return_ka = *ka;

		if (ka->sa.sa_flags & SA_ONESHOT)
			ka->sa.sa_handler = SIG_DFL;

		break; /* will return non-zero "signr" value */
	}
	...
	/*
	 * Anything else is fatal, maybe with a core dump.
	 */
	current->flags |= PF_SIGNALED;

	if (sig_kernel_coredump(signr)) {
		if (print_fatal_signals)
			print_fatal_signal(info->si_signo);
		proc_coredump_connector(current);
		/*
		 * If it was able to dump core, this kills all
		 * other threads in the group and synchronizes with
		 * their demise. If we lost the race with another
		 * thread getting here, it set group_exit_code
		 * first and our do_group_exit call below will use
		 * that value and ignore the one we pass it.
		 */
		do_coredump(info);
	}

	/*
	 * Death signals, no core dump.
	 */
	do_group_exit(info->si_signo);
	/* NOTREACHED */
}

The function checks whether the signal should be ignored, be handed to a userspace handler, stop all threads, and so on; finally it calls sig_kernel_coredump to decide whether a coredump file should be generated:

#define sig_kernel_coredump(sig)	\
	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))

The signals that satisfy this condition are:

  • SIGQUIT
  • SIGILL
  • SIGTRAP
  • SIGABRT
  • SIGFPE
  • SIGSEGV
  • SIGBUS
  • SIGSYS
  • SIGXCPU
  • SIGXFSZ
#define SIG_KERNEL_COREDUMP_MASK (\
	rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL)  | \
	rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
	rt_sigmask(SIGFPE)  | rt_sigmask(SIGSEGV) | \
	rt_sigmask(SIGBUS)  | rt_sigmask(SIGSYS)  | \
	rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
	SIGEMT_MASK)
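
As an aside, the ka->sa.sa_handler != SIG_DFL branch in the loop above explains why a process that installs its own SIGSEGV handler never reaches do_coredump: the signal is delivered to the handler instead. A minimal sketch to verify this (the file name sigcatch.c is my own, not from the kernel source):

#include <signal.h>
#include <string.h>
#include <unistd.h>

static void on_segv(int sig)
{
    /* Only async-signal-safe calls here; returning would re-execute
     * the faulting instruction, so exit directly. */
    static const char msg[] = "caught SIGSEGV, no core dumped\n";
    (void)sig;
    write(STDERR_FILENO, msg, sizeof(msg) - 1);
    _exit(1);
}

int main(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = on_segv;          /* sa_handler != SIG_DFL */
    sigemptyset(&sa.sa_mask);
    sigaction(SIGSEGV, &sa, NULL);

    char *p = NULL;
    char c = *p;                      /* same NULL dereference as bad.c */
    (void)c;
    return 0;
}

Running it prints the message and exits, and the shell does not report "(core dumped)".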

Our process received SIGSEGV, so do_coredump (fs/coredump.c) is called to generate the coredump file. do_coredump calls rlimit to fetch the process's core file size limit:

struct coredump_params cprm = {
	.siginfo = siginfo,
	.regs = signal_pt_regs(),
	.limit = rlimit(RLIMIT_CORE),
	/*
	 * We must use the same mm->flags while dumping core to avoid
	 * inconsistency of bit flags, since this flag is not protected
	 * by any locks.
	 */
	.mm_flags = mm->flags,
};

Next it checks whether the current process's binary-format handler provides a core_dump callback:

binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
	goto fail;

So if, for some special reason, a process must not be allowed to produce core files, this can be achieved by setting binfmt->core_dump to NULL from a kernel module.

Continuing with the code, format_corename is then called to parse the coredump file name parameters:

/* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
static int format_corename(struct core_name *cn, struct coredump_params *cprm)
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	int ispipe = (*pat_ptr == '|');
	...
out:
	return ispipe;
}

As we can see, this is where the pipe character | is detected.

Execution then branches according to ispipe:

ispipe = format_corename(&cn, &cprm);

if (ispipe) {
	// handle the pipe case
} else {
	...
}

Let's follow the branch where ispipe is 1:

if (ispipe) {
	int dump_count;
	char **helper_argv;
	struct subprocess_info *sub_info;

	if (ispipe < 0) {
		printk(KERN_WARNING "format_corename failed\n");
		printk(KERN_WARNING "Aborting core\n");
		goto fail_corename;
	}

	if (cprm.limit == 1) {
		/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
		 *
		 * Normally core limits are irrelevant to pipes, since
		 * we're not writing to the file system, but we use
		 * cprm.limit of 1 here as a speacial value, this is a
		 * consistent way to catch recursive crashes.
		 * We can still crash if the core_pattern binary sets
		 * RLIM_CORE = !1, but it runs as root, and can do
		 * lots of stupid things.
		 *
		 * Note that we use task_tgid_vnr here to grab the pid
		 * of the process group leader. That way we get the
		 * right pid if a thread in a multi-threaded
		 * core_pattern process dies.
		 */
		printk(KERN_WARNING
			"Process %d(%s) has RLIMIT_CORE set to 1\n",
			task_tgid_vnr(current), current->comm);
		printk(KERN_WARNING "Aborting core\n");
		goto fail_unlock;
	}
	cprm.limit = RLIM_INFINITY;

	dump_count = atomic_inc_return(&core_dump_count);
	if (core_pipe_limit && (core_pipe_limit < dump_count)) {
		printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
		       task_tgid_vnr(current), current->comm);
		printk(KERN_WARNING "Skipping core dump\n");
		goto fail_dropcount;
	}

	helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
	if (!helper_argv) {
		printk(KERN_WARNING "%s failed to allocate memory\n",
		       __func__);
		goto fail_dropcount;
	}

	retval = -ENOMEM;
	sub_info = call_usermodehelper_setup(helper_argv[0],
					helper_argv, NULL, GFP_KERNEL,
					umh_pipe_setup, NULL, &cprm);
	if (sub_info)
		retval = call_usermodehelper_exec(sub_info,
						  UMH_WAIT_EXEC);

	argv_free(helper_argv);
	if (retval) {
		printk(KERN_INFO "Core dump to %s pipe failed\n",
		       cn.corename);
		goto close_fail;
	}
}

When cprm.limit is 1, the invocation of the userspace coredump helper is skipped. So, in the scenario mentioned above where a coredump helper is configured but our process must not generate a core, the process can also call setrlimit to set RLIMIT_CORE to 1:

struct rlimit rlimit;

rlimit.rlim_cur = 1;
rlimit.rlim_max = 1;
if (setrlimit(RLIMIT_CORE, &rlimit) != 0) {
	...
}

Note that the ulimit command cannot be used for this, because the unit of ulimit -c is a block of 1024 bytes: after ulimit -c 1, the process's actual RLIMIT_CORE is 1024.
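
To see the unit conversion concretely, a quick check (the file name rlim.c is my own, not from the article) prints the limit the process actually inherits:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_CORE, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    /* After "ulimit -c 1" in the invoking shell, this prints
     * soft=1024: the shell counts RLIMIT_CORE in 1024-byte blocks. */
    printf("RLIMIT_CORE soft=%llu hard=%llu\n",
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);
    return 0;
}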

The inconvenient part of this approach is that older kernels (e.g. 2.6.32) use RLIMIT_CORE == 0 as the condition, so for compatibility, different values have to be used on different kernel versions.

Back to the code: after the checks and argument parsing are done, call_usermodehelper_setup is called to build a kernel work_struct for executing the userspace program:

sub_info = call_usermodehelper_setup(helper_argv[0],
				helper_argv, NULL, GFP_KERNEL,
				umh_pipe_setup, NULL, &cprm);

Then call_usermodehelper_exec puts the created work_struct on a system workqueue and waits for the work_struct's associated function __call_usermodehelper to complete:

queue_work(system_unbound_wq, &sub_info->work);
if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
	goto unlock;

if (wait & UMH_KILLABLE) {
	retval = wait_for_completion_killable(&done);
	if (!retval)
		goto wait_done;

	/* umh_complete() will see NULL and free sub_info */
	if (xchg(&sub_info->complete, NULL))
		goto unlock;
	/* fallthrough, umh_complete() was already called */
}

wait_for_completion(&done);

Because the wait parameter passed to call_usermodehelper_exec is UMH_WAIT_EXEC, __call_usermodehelper calls kernel_thread to start a kernel thread that runs call_helper, then calls umh_complete so that wait_for_completion(&done) in call_usermodehelper_exec returns:

/* This is run by khelper thread  */
static void __call_usermodehelper(struct work_struct *work)
{
	struct subprocess_info *sub_info =
		container_of(work, struct subprocess_info, work);
	int wait = sub_info->wait & ~UMH_KILLABLE;
	pid_t pid;

	/* CLONE_VFORK: wait until the usermode helper has execve'd
	 * successfully We need the data structures to stay around
	 * until that is done.  */
	if (wait == UMH_WAIT_PROC)
		pid = kernel_thread(wait_for_helper, sub_info,
				    CLONE_FS | CLONE_FILES | SIGCHLD);
	else {
		pid = kernel_thread(call_helper, sub_info,
				    CLONE_VFORK | SIGCHLD);
		/* Worker thread stopped blocking khelper thread. */
		kmod_thread_locker = NULL;
	}

	switch (wait) {
	case UMH_NO_WAIT:
		call_usermodehelper_freeinfo(sub_info);
		break;

	case UMH_WAIT_PROC:
		if (pid > 0)
			break;
		/* FALLTHROUGH */
	case UMH_WAIT_EXEC:
		if (pid < 0)
			sub_info->retval = pid;
		umh_complete(sub_info);
	}
}

call_helper eventually reaches ____call_usermodehelper:

if (sub_info->init) {
	retval = sub_info->init(sub_info, new);
	if (retval) {
		abort_creds(new);
		goto fail;
	}
}

commit_creds(new);

retval = do_execve(getname_kernel(sub_info->path),
		   (const char __user *const __user *)sub_info->argv,
		   (const char __user *const __user *)sub_info->envp);
if (!retval)
	return 0;

Before do_execve launches the userspace process, the umh_pipe_setup function, which do_coredump passed in through sub_info->init via call_usermodehelper_setup, is invoked:

/*
 * umh_pipe_setup
 * helper function to customize the process used
 * to collect the core in userspace.  Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process.  Returns 0 on success, or
 * PTR_ERR on failure.
 * Note that it also sets the core limit to 1.  This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err = create_pipe_files(files, 0);
	if (err)
		return err;

	cp->file = files[1];

	err = replace_fd(0, files[0], 0);
	fput(files[0]);
	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return err;
}

This function creates the pipe, saves the pipe's write end in the coredump parameters (cp->file), and installs the read end as fd 0, so that the userspace helper program can read the coredump content from fd 0 (its stdin). It then sets the task's RLIMIT_CORE to {1, 1}, so that if the coredump helper itself crashes and would dump core, we do not end up recursively invoking coredump helpers forever.
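
To make the fd 0 contract concrete, here is roughly what a coredump helper boils down to in C. This is only a sketch of the same idea as the perl helper used later in this article, and it assumes the same "%p" argument and /root/pipecore.<pid> output path as that experiment:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Invoked by the kernel as "|/path/to/helper %p":
 * argv[1] is the crashing PID and the core image arrives on fd 0. */
int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;

    char path[256];
    snprintf(path, sizeof(path), "/root/pipecore.%s", argv[1]);

    int out = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
    if (out < 0)
        return 1;

    /* Copy the core image from stdin (the pipe's read end) to the file. */
    char buf[4096];
    ssize_t n;
    while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
        if (write(out, buf, (size_t)n) != n) {
            close(out);
            return 1;   /* short write or error */
        }
    }
    close(out);
    return 0;
}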

Finally, binfmt->core_dump is called to generate the coredump contents and write them into cprm->file; when ispipe is 1, cprm->file is the write end of the pipe created earlier:

/* get us an unshared descriptor table; almost always a no-op */
retval = unshare_files(&displaced);
if (retval)
	goto close_fail;
if (displaced)
	put_files_struct(displaced);
if (!dump_interrupted()) {
	file_start_write(cprm.file);
	core_dumped = binfmt->core_dump(&cprm);
	file_end_write(cprm.file);
}
if (ispipe && core_pipe_limit)
	wait_for_dump_helpers(cprm.file);
close_fail:
if (cprm.file)
	filp_close(cprm.file, NULL);

This is how the userspace program receives the coredump content on its standard input, fd 0. And since current kernels detect recursive coredumps by checking RLIMIT_CORE == 1, when a coredump helper is configured the helper is still executed even if ulimit -c 0 is set.

One more point to note: in the experiment above, the bad process waited until the coredump helper exited, but in fact the kernel does not wait for the coredump helper to finish (unless core_pipe_limit is non-zero, in which case wait_for_dump_helpers is called, as the code above shows). Once the core_dump function returns, execution reaches filp_close, which closes the pipe's write end. Our bad process lingered only because our coredump helper never read from the pipe, so the kernel's pipe write blocked.

Let's modify the coredump helper to verify this. Since bash is not convenient for reading and writing binary data, we use perl to read the coredump content from standard input. /root/core.dumper becomes:

#!/usr/bin/perl

use strict;

my $core_name = "/root/pipecore." . $ARGV[0];
open(my $outfh, '+>', $core_name) or die "open file failed: $!";
binmode($outfh);

my $n = 0;
my $buffer;
while (1) {
    $n = read(STDIN, $buffer, 4096);
    # read() returns 0 on EOF and undef on error
    last if (!defined($n) or $n == 0);
    print $outfh $buffer;
}

close($outfh);

sleep(60);

Modify the core_pattern parameter to pass the process ID to core.dumper:

echo "|/root/core.dumper %p" > /proc/sys/kernel/core_pattern

Then run the crashing program twice; both runs return quickly:

[root@default ~]# time ./bad
Segmentation fault (core dumped)

real 0m0.019s
user 0m0.001s
sys 0m0.003s
[root@default ~]# time ./bad
Segmentation fault (core dumped)

real 0m0.017s
user 0m0.000s
sys 0m0.011s

Checking the running perl processes, we see two of them:

[root@default ~]# ps aux |grep perl
root 5689 0.0 0.2 26132 2132 ? S 09:17 0:00 /usr/bin/perl /root/core.dumper 5688
root 5700 0.0 0.2 26132 2132 ? S 09:17 0:00 /usr/bin/perl /root/core.dumper 5699
root 5705 0.0 0.1 12528 976 pts/0 R+ 09:17 0:00 grep --color=auto perl

The PIDs in the generated core file names match the perl processes' arguments:

[root@default ~]# ls -l pipecore.*
-rw-rw-rw-. 1 root root 245760 Oct 31 09:17 pipecore.5688
-rw-rw-rw-. 1 root root 245760 Oct 31 09:17 pipecore.5699

The kernel call stacks of the two processes show both are in sleep:

[root@default ~]# cat /proc/5689/stack
[<ffffffff9a4cb53b>] hrtimer_nanosleep+0xbb/0x180
[<ffffffff9a4cb696>] SyS_nanosleep+0x96/0xb0
[<ffffffff9ab92ed2>] system_call_fastpath+0x25/0x2a
[<ffffffffffffffff>] 0xffffffffffffffff
[root@default ~]# cat /proc/5700/stack
[<ffffffff9a4cb53b>] hrtimer_nanosleep+0xbb/0x180
[<ffffffff9a4cb696>] SyS_nanosleep+0x96/0xb0
[<ffffffff9ab92ed2>] system_call_fastpath+0x25/0x2a
[<ffffffffffffffff>] 0xffffffffffffffff

Verify the generated core file with gdb:

[root@default ~]# gdb bad pipecore.5688
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-119.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /root/bad...done.
[New LWP 5688]
Core was generated by `./bad'.
Program terminated with signal 11, Segmentation fault.
#0 0x0000000000400504 in main (argc=1, argv=0x7fff56990d58) at bad.c:5
5 char c = *p;
Missing separate debuginfos, use: debuginfo-install glibc-2.17-307.el7.1.x86_64
(gdb)

The file is valid, and gdb correctly shows where the crash happened.
