起因

在幽兰代码本上运行Waydroid时可能会遇到无法启动的情况,通过查看dmesg信息可以知道,这是由于lmkd服务无法启动导致安卓的init进程启动失败。


[ 5007.044079] libprocessgroup: Successfully killed process cgroup uid 1069 pid 512 in 0ms
[ 5007.044325] init: critical process 'lmkd' exited 4 times before boot completed
[ 5007.050376] init: InitFatalReboot: signal 6
[ 5007.079301] init: #00 pc 00000000000aee54  /system/bin/init (android::init::InitFatalReboot(int)+208)
[ 5007.079471] init: #01 pc 000000000004d61c  /system/bin/init (android::init::InitAborter(char const*)+48)
[ 5007.079567] init: #02 pc 0000000000013978  /system/lib64/libbase.so (android::base::SetAborter(std::__1::function<void (char const*)>&&)::$_3::__invoke(char const*)+76)
[ 5007.079610] init: #03 pc 0000000000012fa4  /system/lib64/libbase.so (android::base::LogMessage::~LogMessage()+320)
[ 5007.079702] init: #04 pc 0000000000065294  /system/bin/init (android::init::Service::Reap(siginfo const&)+1336)
[ 5007.079739] init: #05 pc 00000000000b418c  /system/bin/init (android::init::ReapOneProcess()+496)
[ 5007.079775] init: #06 pc 00000000000b3f8c  /system/bin/init (android::init::ReapAnyOutstandingChildren()+8)
[ 5007.079812] init: #07 pc 0000000000084320  /system/bin/init (android::init::SecondStageMain(int, char**)+6964)
[ 5007.079848] init: #08 pc 000000000002f184  /system/bin/init (main+304)
[ 5007.079883] init: #09 pc 000000000008506c  /system/lib64/bootstrap/libc.so (__libc_init+108)
[ 5007.079969] init: Reboot ending, jumping to kernel
[ 5007.099096] init: InitFatalReboot: signal 6
[ 5007.104781] init: InitFatalReboot: signal 11
[ 5007.127388] init: #00 pc 00000000000aee54  /system/bin/init (android::init::InitFatalReboot(int)+208)
[ 5007.127450] init: #01 pc 00000000000af234  /system/bin/init (android::init::InstallRebootSignalHandlers()::$_22::__invoke(int)+32)
[ 5007.127487] init: #02 pc 00000000000007c0  [vdso:0000007fb83a6000]
[ 5007.127524] init: #03 pc 00000000000cec24  /system/bin/init (android::properties::PropertyInfoArea::GetPropertyInfoIndexes(char const*, unsigned int*, unsigned int*) const+28)
[ 5007.127563] init: #04 pc 00000000000cefc4  /system/bin/init (android::properties::PropertyInfoArea::GetPropertyInfo(char const*, char const**, char const**) const+56)
[ 5007.127600] init: #05 pc 00000000000cefc4  /system/bin/init (android::properties::PropertyInfoArea::GetPropertyInfo(char const*, char const**, char const**) const+56)
[ 5007.127643] init: #06 pc 000000000009b73c  /system/bin/init (android::init::CheckPermissions(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, ucred const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*)+240)
[ 5007.127689] init: #07 pc 000000000009b89c  /system/bin/init (android::init::HandlePropertySet(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, ucred const&, android::init::SocketConnection*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*)+68)
[ 5007.127726] init: #08 pc 00000000000a0ddc  /system/bin/init (android::init::handle_property_set_fd()+748)
[ 5007.127762] init: #09 pc 000000000009fee8  /system/bin/init (android::init::PropertyServiceThread()+704)
[ 5007.127794] init: #10 pc 000000000006e0a0  /system/bin/init
[ 5007.127830] init: #11 pc 00000000000ec8ec  /system/lib64/bootstrap/libc.so (__pthread_start(void*)+64)
[ 5007.127921] init: #12 pc 000000000008ba84  /system/lib64/bootstrap/libc.so (__start_thread+64)
[ 5007.127953] init: Reboot ending, jumping to kernel
[  377.026793] binder: release 3788:3788 transaction 21474 out, still active
[  377.026804] binder: undelivered TRANSACTION_COMPLETE
[  377.026866] binder: release 3768:3768 transaction 21208 out, still active
[  377.026871] binder: undelivered TRANSACTION_COMPLETE
[  377.027203] binder: release 3065:3129 transaction 21897 in, still active
[  377.027300] binder: send failed reply for transaction 21897 to 3279:3310
[  377.027420] binder: release 3065:3065 transaction 21898 out, still active
[  377.027427] binder: undelivered TRANSACTION_COMPLETE
[  377.027621] binder: release 2985:3087 transaction 21898 in, still active
[  377.027673] binder: send failed reply for transaction 21898, target dead

lmkd简介

lmkd的全称是低内存杀手守护进程(low memory killer daemon),由于安卓系统运行多个进程时可能会遇到内存耗尽的问题,所以会通过内存的cgroup进行内存分配,并通过lmkd对内存进行监视,在内存不足时杀死进程,降低内存压力。

lmkd可以使用vmpressure事件或PSI(Pressure Stall Information,压力阻塞信息)监视内存压力信息。由于vmpressure信号通常包含大量误报,lmkd需要额外确定内存是否真的紧缺,从而消耗额外的资源,所以一般会使用PSI进行更准确的内存压力监视,减小资源的开销。

内核支持

lmkd作为用户态程序无法代替操作系统进行硬件资源的管理,所以lmkd监视内存需要内核的帮助。

对于Linux内核而言,需要按下面的方式配置内核选项:关闭内核态的low memory killer驱动,并开启内存cgroup(memcg)和PSI的支持。

CONFIG_ANDROID_LOW_MEMORY_KILLER=n
CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
CONFIG_PSI=y

PSI的开启

此时lmkd服务启动失败,考虑到lmkd对内核的依赖,所以首先排查内核当中相关配置选项是否开启。

通过查看虚文件/proc/config.gz可以确定,lmkd需要的内核选项都已经处于开启状态。

CONFIG_MEMCG=y
CONFIG_MEMCG_SWAP=y
CONFIG_MEMCG_KMEM=y
CONFIG_SLUB_MEMCG_SYSFS_ON=y
# CONFIG_TEST_MEMCAT_P is not set

CONFIG_PSI=y
CONFIG_PSI_DEFAULT_DISABLED=y

由于dmesg中的报错信息存在binder通信失败的情况,而内核子系统为了和用户态程序通信,一般会在procfs(即/proc目录)下创建虚文件,所以此时排查psi驱动有没有建立虚文件,为lmkd服务提供与内核通信的接口。

在查看proc目录后,可以确认psi并没有建立与用户态程序进行通信的接口,lmkd无法通过/proc/pressure下的虚文件获取内存压力信息,启动自然也会失败。

ls -lh /proc/pressure/
ls: cannot access '/proc/pressure/': No such file or directory

在上面的内核选项中可以看到psi功能默认是禁用的(CONFIG_PSI_DEFAULT_DISABLED=y)。查询内核代码可以知道,只有当CONFIG_PSI_DEFAULT_DISABLED关闭时,psi_enable才会默认处于开启状态,否则需要通过内核命令行参数psi=进行赋值。

/*
 * Boot-time switch for PSI. When CONFIG_PSI_DEFAULT_DISABLED is defined,
 * psi_enable has no initializer (a static bool, so it starts out false);
 * otherwise it starts out true.
 */
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
/*
 * Handler for the "psi=" kernel command-line option.
 *
 * Parses str as a boolean into psi_enable via kstrtobool().
 * Returns 1 when kstrtobool() reports success (returns 0), 0 otherwise.
 */
static int __init setup_psi(char *str)
{
    return kstrtobool(str, &psi_enable) == 0;
}
/* Register setup_psi as the handler for the "psi=" prefix. */
__setup("psi=", setup_psi);

psi_enable处于关闭状态时,会直接影响到psi驱动的初始化行为,并导致其不会创建虚文件。

/*
 * Init-time setup for PSI.
 *
 * When psi_enable is false, enables the psi_disabled static branch and
 * returns without doing any further setup. Otherwise it optionally turns
 * off the psi_cgroups_enabled static branch, derives psi_period from
 * PSI_FREQ, and initializes the psi_system group.
 */
void __init psi_init(void)
{
    /* PSI was not enabled (by default or via "psi="): mark it disabled and stop. */
    if (!psi_enable) {
        static_branch_enable(&psi_disabled);
        return;
    }

    /* Turn off the psi_cgroups_enabled branch when cgroup_psi_enabled() is false. */
    if (!cgroup_psi_enabled())
        static_branch_disable(&psi_cgroups_enabled);

    /* PSI_FREQ is in jiffies; psi_period holds the same interval in nanoseconds. */
    psi_period = jiffies_to_nsecs(PSI_FREQ);
    group_init(&psi_system);
}

/*
 * Creates the "pressure" procfs directory and its io, memory and cpu
 * entries, but only when psi_enable is true. With psi_enable false nothing
 * is created. Always returns 0.
 */
static int __init psi_proc_init(void)
{
    if (psi_enable) {
        proc_mkdir("pressure", NULL);
        proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
        proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
        proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
    }
    return 0;
}

从内核代码中可以知道,当CONFIG_PSI_DEFAULT_DISABLED开启时,psi_enable的数值由通过__setup宏注册的处理函数setup_psi决定。

psi驱动通过__setup宏在.init.setup段中注册检索值psi=和处理函数setup_psi.

/*
 * Defines two objects named after unique_id:
 *   - __setup_str_<unique_id>: a const char array initialized from str;
 *   - __setup_<unique_id>: a struct obs_kernel_param initialized to
 *     { that string, fn, early } and placed in the ".init.setup" section,
 *     aligned to sizeof(long).
 */
#define __setup_param(str, unique_id, fn, early)            \
    static const char __setup_str_##unique_id[] __initconst        \
        __aligned(1) = str;                     \
    static struct obs_kernel_param __setup_##unique_id        \
        __used __section(".init.setup")                \
        __attribute__((aligned((sizeof(long)))))        \
        = { __setup_str_##unique_id, fn, early }

当内核开始初始化时,内核启动函数start_kernel会通过parse_args解析内核命令行,并将其中无法在内核参数表中匹配到的选项传递给unknown_bootoption。

/*
 * Parse static_command_line against the parameter table that starts at
 * __start___param and has (__stop___param - __start___param) entries,
 * passing unknown_bootoption as the final callback argument.
 * NOTE(review): presumably the callback is invoked for parameters that do
 * not match the table - confirm against parse_args().
 */
after_dashes = parse_args("Booting kernel",
                  static_command_line, __start___param,
                  __stop___param - __start___param,
                  -1, -1, NULL, &unknown_bootoption);

unknown_bootoption函数内会先通过obsolete_checksetup检查启动固件传递过来的内核命令行。

/*
 * Callback handed to parse_args() for a single command-line parameter.
 *
 * param: parameter name; val: its value, or NULL when there is none.
 * unused and arg are not referenced.
 *
 * The parameter is first offered to obsolete_checksetup(). If that does not
 * consume it, names containing '.' are ignored, and otherwise the parameter
 * is stored in envp_init (when it has a value) or argv_init (when it does
 * not), unless a panic is already pending in panic_later.
 *
 * Always returns 0.
 */
static int __init unknown_bootoption(char *param, char *val,
                     const char *unused, void *arg)
{
    /* Length of the name, taken before repair_env_string() touches param. */
    size_t len = strlen(param);

    repair_env_string(param, val);

    /* Handle obsolete-style parameters */
    if (obsolete_checksetup(param))
        return 0;

    /* Unused module parameter. */
    if (strnchr(param, len, '.'))
        return 0;

    /* A panic has already been recorded; do not store anything more. */
    if (panic_later)
        return 0;

    if (val) {
        /* Environment option */
        unsigned int i;
        /*
         * Find the slot for param: stop at an existing entry whose first
         * len+1 bytes equal param's (that entry is overwritten), or at the
         * first NULL slot.
         */
        for (i = 0; envp_init[i]; i++) {
            /* Table is full: record the reason and offending parameter. */
            if (i == MAX_INIT_ENVS) {
                panic_later = "env";
                panic_param = param;
            }
            if (!strncmp(param, envp_init[i], len+1))
                break;
        }
        envp_init[i] = param;
    } else {
        /* Command line option */
        unsigned int i;
        /* Walk to the first NULL slot of argv_init. */
        for (i = 0; argv_init[i]; i++) {
            /* Table is full: record the reason and offending parameter. */
            if (i == MAX_INIT_ARGS) {
                panic_later = "init";
                panic_param = param;
            }
        }
        argv_init[i] = param;
    }
    return 0;
}

obsolete_checksetup函数会检查.init.setup中注册的信息,并根据对应的处理函数进行处理。

/*
 * Offers one command-line parameter to the obs_kernel_param entries lying
 * between __setup_start and __setup_end.
 *
 * For each entry whose str matches the first strlen(str) characters of line
 * (as judged by parameqn()):
 *   - early entries are not run here; an exact match on the name part only
 *     sets had_early_param;
 *   - entries without a setup_func log a warning and return true;
 *   - otherwise setup_func is called with the text following the matched
 *     prefix, and a non-zero result returns true.
 *
 * Returns true when an entry consumed the parameter; otherwise returns
 * had_early_param.
 */
static bool __init obsolete_checksetup(char *line)
{
    const struct obs_kernel_param *p;
    bool had_early_param = false;

    p = __setup_start;
    /* do/while: the first entry is examined before the bounds check. */
    do {
        int n = strlen(p->str);
        if (parameqn(line, p->str, n)) {
            if (p->early) {
                /* Already done in parse_early_param?
                 * (Needs exact match on param part).
                 * Keep iterating, as we can have early
                 * params and __setups of same names 8( */
                if (line[n] == '\0' || line[n] == '=')
                    had_early_param = true;
            } else if (!p->setup_func) {
                pr_warn("Parameter %s is obsolete, ignored\n",
                    p->str);
                return true;
            } else if (p->setup_func(line + n))
                /* Handler receives only the text after the matched prefix. */
                return true;
        }
        p++;
    } while (p < __setup_end);

    return had_early_param;
}

所以当启动固件uboot传递的内核命令行包含psi=1时,psi_enable就会处于开启状态。

此时修改uboot传递的内核命令行添加psi=1后,再次启动系统,就可以发现psi的虚文件被正确注册,且Waydroid可以启动运行。

ls -lh /proc/pressure/
total 0
-r--r--r-- 1 root root 0 Apr 15 14:17 cpu
-r--r--r-- 1 root root 0 Apr 15 14:17 io
-r--r--r-- 1 root root 0 Apr 15 14:17 memory
作者:admin  创建时间:2024-04-15 15:21
最后编辑:admin  更新时间:2024-11-15 17:44