/** * Low-level function to reboot the device. On success, this * function doesn't return. If more than 20 seconds passes from * the time a reboot is requested, this method returns. * * @param reason code to pass to the kernel (e.g. "recovery"), or null. */ publicstaticvoidlowLevelReboot(String reason) { if (reason == null) { reason = ""; }
// If the reason is "quiescent", it means that the boot process should proceed // without turning on the screen/lights. // The "quiescent" property is sticky, meaning that any number // of subsequent reboots should honor the property until it is reset. if (reason.equals(PowerManager.REBOOT_QUIESCENT)) { sQuiescent = true; reason = ""; } elseif (reason.endsWith("," + PowerManager.REBOOT_QUIESCENT)) { sQuiescent = true; reason = reason.substring(0, reason.length() - PowerManager.REBOOT_QUIESCENT.length() - 1); }
if (reason.equals(PowerManager.REBOOT_RECOVERY) || reason.equals(PowerManager.REBOOT_RECOVERY_UPDATE)) { reason = "recovery"; }
if (sQuiescent) { // Pass the optional "quiescent" argument to the bootloader to let it know // that it should not turn the screen/lights on. reason = reason + ",quiescent"; }
voidproperty_changed(const std::string& name, const std::string& value){ // If the property is sys.powerctl, we bypass the event queue and immediately handle it. // This is to ensure that init will always and immediately shutdown/reboot, regardless of // if there are other pending events to process or if init is waiting on an exec service or // waiting on a property. // In non-thermal-shutdown case, 'shutdown' trigger will be fired to let device specific // commands to be executed. if (name == "sys.powerctl") { // Despite the above comment, we can't call HandlePowerctlMessage() in this function, // because it modifies the contents of the action queue, which can cause the action queue // to get into a bad state if this function is called from a command being executed by the // action queue. Instead we set this flag and ensure that shutdown happens before the next // command is run in the main init loop. // TODO: once property service is removed from init, this will never happen from a builtin, // but rather from a callback from the property service socket, in which case this hack can // go away. shutdown_command = value; do_shutdown = true; }
init.cpp的main():
while (true) { // By default, sleep until something happens. int epoll_timeout_ms = -1;
if (do_shutdown && !shutting_down) { do_shutdown = false; if (HandlePowerctlMessage(shutdown_command)) { shutting_down = true; } }
/* Name of the system property in which the Android reboot reason is stored. */ #define LAST_REBOOT_REASON_PROPERTY "persist.sys.boot.reason"
复现了一次,抓到如下 log(通过 adb reboot 压测触发):
[ 36.255358] init: Received sys.powerctl='reboot,adb' from pid: 2491 (/system/bin/adbd) [ 36.255533] init: Clear action queue and start shutdown trigger [ 36.255731] init: processing action (shutdown_done) from (<Builtin Action>:0) [ 36.255759] init: Reboot start, reason: reboot,adb, rebootTarget: adb [ 36.291254] init: Shutdown timeout: 6000 ms [ 36.291385] init: Could not start shutdown critical service 'chre': Cannot find '/vendor/bin/chre': No such file or directory [ 36.292815] init: starting service 'blank_screen'... [ 36.294620] init: terminating init services [ 36.294804] init: Sending signal 15 to service 'gx_fpd' (pid 3714) process group... [ 36.295203] init: Sending signal 15 to service 'trustonic-daemon' (pid 3295) process group...
继续跟下代码:
// keep debugging tools until non critical ones are all gone. const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"}; // watchdogd is a vendor specific component but should be alive to complete shutdown safely. const std::set<std::string> to_starts{"watchdogd"}; for (constauto& s : ServiceList::GetInstance()) { if (kill_after_apps.count(s->name())) { s->SetShutdownCritical(); } elseif (to_starts.count(s->name())) { if (auto result = s->Start(); !result) { LOG(ERROR) << "Could not start shutdown 'to_start' service '" << s->name() << "': " << result.error(); } s->SetShutdownCritical(); } elseif (s->IsShutdownCritical()) { // Start shutdown critical service if not started. if (auto result = s->Start(); !result) { LOG(ERROR) << "Could not start shutdown critical service '" << s->name() << "': " << result.error(); } } }
// remaining operations (specifically fsck) may take a substantial duration if (cmd == ANDROID_RB_POWEROFF || is_thermal_shutdown) { TurnOffBacklight(); }
Service* bootAnim = ServiceList::GetInstance().FindService("bootanim"); Service* surfaceFlinger = ServiceList::GetInstance().FindService("surfaceflinger"); if (bootAnim != nullptr && surfaceFlinger != nullptr && surfaceFlinger->IsRunning()) { // will not check animation class separately for (constauto& service : ServiceList::GetInstance()) { if (service->classnames().count("animation")) service->SetShutdownCritical(); } }
// optional shutdown step // 1. terminate all services except shutdown critical ones. wait for delay to finish if (shutdown_timeout > 0ms) { LOG(INFO) << "terminating init services";
// Ask all services to terminate except shutdown critical ones. for (constauto& s : ServiceList::GetInstance().services_in_shutdown_order()) { if (!s->IsShutdownCritical()) s->Terminate(); }
int service_count = 0; // Only wait up to half of timeout here auto termination_wait_timeout = shutdown_timeout / 2; while (t.duration() < termination_wait_timeout) { ReapAnyOutstandingChildren();
service_count = 0; for (constauto& s : ServiceList::GetInstance()) { // Count the number of services running except shutdown critical. // Exclude the console as it will ignore the SIGTERM signal // and not exit. // Note: SVC_CONSOLE actually means "requires console" but // it is only used by the shell. if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) { service_count++; } }
if (service_count == 0) { // All terminable services terminated. We can exit early. break; }
// Wait a bit before recounting the number or running services. std::this_thread::sleep_for(50ms); } LOG(INFO) << "Terminating running services took " << t << " with remaining services:" << service_count; }
这里会有个等待超时时间termination_wait_timeout,调大些?继续看:
// minimum safety steps before restarting // 2. kill all services except ones that are necessary for the shutdown sequence. for (constauto& s : ServiceList::GetInstance().services_in_shutdown_order()) { if (!s->IsShutdownCritical()) s->Stop(); //tj: stop non critical } ReapAnyOutstandingChildren();
// 3. send volume shutdown to vold Service* voldService = ServiceList::GetInstance().FindService("vold"); if (voldService != nullptr && voldService->IsRunning()) { ShutdownVold(); voldService->Stop(); } else { LOG(INFO) << "vold not running, skipping vold shutdown"; } // logcat stopped here for (constauto& s : ServiceList::GetInstance().services_in_shutdown_order()) { if (kill_after_apps.count(s->name())) s->Stop(); }
// 4. sync, try umount, and optionally run fsck for user shutdown { Timer sync_timer; LOG(INFO) << "sync() before umount..."; sync(); LOG(INFO) << "sync() before umount took" << sync_timer; } UmountStat stat = TryUmountAndFsck(runFsck, shutdown_timeout - t.duration()); // Follow what linux shutdown is doing: one more sync with little bit delay { Timer sync_timer; LOG(INFO) << "sync() after umount..."; sync(); LOG(INFO) << "sync() after umount took" << sync_timer; }
/* Try umounting all emulated file systems R/W block device cfile systems. * This will just try umount and give it up if it fails. * For fs like ext4, this is ok as file system will be marked as unclean shutdown * and necessary check can be done at the next reboot. * For safer shutdown, caller needs to make sure that * all processes / emulated partition for the target fs are all cleaned-up. * * return true when umount was successful. false when timed out. */ static UmountStat TryUmountAndFsck(bool runFsck, std::chrono::milliseconds timeout){ Timer t; std::vector<MountEntry> block_devices; std::vector<MountEntry> emulated_devices;
if (runFsck && !FindPartitionsToUmount(&block_devices, &emulated_devices, false)) { return UMOUNT_STAT_ERROR; }
UmountStat stat = UmountPartitions(timeout - t.duration()); if (stat != UMOUNT_STAT_SUCCESS) { LOG(INFO) << "umount timeout, last resort, kill all and try"; bool dumpUmountDebugInfo = property_get_bool("persist.sys.dumpUmountDebugInfo",false); if (dumpUmountDebugInfo) { if (DUMP_ON_UMOUNT_FAILURE) DumpUmountDebuggingInfo(true); } KillAllProcesses(); // even if it succeeds, still it is timeout and do not run fsck with all processes killed UmountStat st = UmountPartitions(0ms); if (dumpUmountDebugInfo) { if ((st != UMOUNT_STAT_SUCCESS) && DUMP_ON_UMOUNT_FAILURE) DumpUmountDebuggingInfo(false); } }
我们这里是 reboot,不走 fsck,而且 "sync() before umount..." 这条 log 也看不到。很有可能就是卡在 umount 里了?
又复现一次:
[ 92.971658] scm: secure world has been busy for 1 second! [ 94.331640] scm_call failed: func id 0x7300fa01, ret: -12, syscall returns: 0x0, 0x0, 0x0 [ 97.454185] init: Terminating running services took 6023ms with remaining services:3 [ 97.454293] init: Sending signal 9 to service 'gx_fpd' (pid 4944) process group... [ 97.680210] libprocessgroup: Failed to kill process cgroup uid 1000 pid 4944 in 225ms, 1 processes remain [ 97.680392] init: Sending signal 9 to service 'trustonic-daemon' (pid 3315) process group... [ 97.905801] libprocessgroup: Failed to kill process cgroup uid 1000 pid 3315 in 225ms, 1 processes remain [ 97.905980] init: Sending signal 9 to service 'vendor.per_mgr' (pid 561) process group... [ 97.914897] libprocessgroup: Successfully killed process cgroup uid 1000 pid 561 in 8ms [ 97.915339] init: Service 'vendor.per_mgr' (pid 561) received signal 9 [ 97.951903] vdc: Waited 0ms for vold [ 97.952039] binder: 483:487 transaction failed 29189/-22, size 88-0 line 3017 [ 101.601745] Trustonic TEE: wait_mcp_notification: No answer after 10s
这里我们加大过timeout,仍然不起作用,有个gx_fpd杀不掉?ps看下:
xxx:/ # ps -A | grep fpd system 4944 1 19768 6844 __qseecom_process_incomplete_cmd 0 S gx_fpd fpd xxx:/ # ps -A | grep trustonic-daemon xxx:/ # ps -A | grep 3315 system 3315 1 0 0 0 0 Z [mcDriverDaemon] xxx:/ # ps -A | grep Z USER PID PPID VSZ RSS WCHAN ADDR S NAME
system 3315 1 0 0 0 0 Z [mcDriverDaemon]
root 5094 1 0 0 0 0 Z [vdc]
状态是 Z?都变成僵尸进程了?后来发现是指纹模组没接,接上之后重启压测上千次,暂未复现。
还有一次复现的 log,这次是用 apk 触发的 reboot,抓到的 log 有限:
<14>[ 86.307908] init: Received sys.powerctl='reboot,' from pid: 1368 (system_server) <14>[ 86.308048] init: Clear action queue and start shutdown trigger <14>[ 86.308278] init: processing action (shutdown_done) from (<Builtin Action>:0) <14>[ 86.308300] init: Reboot start, reason: reboot,, rebootTarget:
Reland "init: Reboot after timeout passes during reboot"
We're experiencing issues where unmount is forcing the device to be stuck permanently, so it's better to have this shutdown thread with a long timeout (30s) and a potential kernel panic if it fails than a stuck device.